Commit 69972fbf authored by Andreas Marek's avatar Andreas Marek

Some cleanup

parent 03d44007
......@@ -46,7 +46,7 @@ AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, w
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20200417], [Current ELPA autotune version])
AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20200417, "Current ELPA autotune version")
AC_DEFINE_UNQUOTED([ELPA_BUILDTIME], [$ELPA_BUILDTIME], ["Time of build"])
dnl Set old_elpa_version to yes when ELPA_BUILDTIME compares greater than the
dnl fixed reference value 1604905771, and to no otherwise.
dnl NOTE(review): both values look like Unix epoch seconds - confirm where
dnl ELPA_BUILDTIME is assigned.
AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1604905771],[old_elpa_version=yes],[old_elpa_version=no])
AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then
......@@ -1776,6 +1776,9 @@ else
echo "build config should be compiled into the library: no"
fi
dnl Define the preprocessor symbol LOOP_BLOCKING as 1 when the shell variable
dnl have_loop_blocking holds the value yes at this point of configure.
dnl NOTE(review): where have_loop_blocking is supposed to be set is not
dnl visible in this fragment - verify against the rest of configure.ac.
if test x"$have_loop_blocking" = x"yes"; then
AC_DEFINE([LOOP_BLOCKING],[1],[use blocking in loops])
fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
......@@ -1986,4 +1989,10 @@ else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
dnl Print an advisory pointing at the ELPA web site when old_elpa_version is
dnl yes. That variable is the result of the AX_COMPARE_VERSION call on
dnl ELPA_BUILDTIME; the message is informational only and changes no settings.
if test x"$old_elpa_version" = x"yes"; then
echo " "
echo " It is possible that your current version of ELPA is not the latest one."
echo " You might want to have a look at https://elpa.mpcdf.mpg.de, whether a more recent"
echo " version has been released already"
echo " "
fi
dnl AX_GCC_VERSION: record the GCC version in the substituted variable
dnl GCC_VERSION.
dnl
dnl GCC_VERSION stays empty unless the shell variable GCC is yes and the
dnl -dumpversion flag check did not report no. In that case it holds the
dnl output of running the compiler with -dumpversion, cached in
dnl ax_cv_gcc_version.
dnl
dnl The macro prints nothing beyond the regular "checking gcc version" line
dnl and never runs the compiler without arguments.
AC_DEFUN([AX_GCC_VERSION], [
  GCC_VERSION=""
  AX_CHECK_COMPILE_FLAG([-dumpversion],
    [ax_gcc_version_option=yes],
    [ax_gcc_version_option=no])
  AS_IF([test "x$GCC" = "xyes"],[
    AS_IF([test "x$ax_gcc_version_option" != "xno"],[
      AC_CACHE_CHECK([gcc version],[ax_cv_gcc_version],[
        dnl An empty result is kept as the empty string.
        ax_cv_gcc_version="`$CC -dumpversion`"
      ])
      GCC_VERSION=$ax_cv_gcc_version
    ])
  ])
  AC_SUBST([GCC_VERSION])
])
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
#
# DESCRIPTION
#
# This macro compares two version strings. Due to the various number of
# minor-version numbers that can exist, and the fact that string
# comparisons are not compatible with numeric comparisons, this is not
# necessarily trivial to do in an autoconf script. This macro makes doing
# these comparisons easy.
#
# The six basic comparisons are available, as well as checking equality
# limited to a certain number of minor-version levels.
#
# The operator OP determines what type of comparison to do, and can be one
# of:
#
# eq - equal (test A == B)
# ne - not equal (test A != B)
# le - less than or equal (test A <= B)
# ge - greater than or equal (test A >= B)
# lt - less than (test A < B)
# gt - greater than (test A > B)
#
# Additionally, the eq and ne operator can have a number after it to limit
# the test to that number of minor versions.
#
# eq0 - equal up to the length of the shorter version
# ne0 - not equal up to the length of the shorter version
# eqN - equal up to N sub-version levels
# neN - not equal up to N sub-version levels
#
# When the condition is true, shell commands ACTION-IF-TRUE are run,
# otherwise shell commands ACTION-IF-FALSE are run. The environment
# variable 'ax_compare_version' is always set to either 'true' or 'false'
# as well.
#
# Examples:
#
# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8])
# AX_COMPARE_VERSION([3.15],[lt],[3.15.8])
#
# would both be true.
#
# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8])
# AX_COMPARE_VERSION([3.15],[gt],[3.15.8])
#
# would both be false.
#
# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8])
#
# would be true because it is only comparing two minor versions.
#
# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15])
#
# would be true because it is only comparing the lesser number of minor
# versions of the two values.
#
# Note: The characters that separate the version numbers do not matter. An
# empty string is the same as version 0. OP is evaluated by autoconf, not
# configure, so must be a string, not a variable.
#
# The author would like to acknowledge Guido Draheim whose advice about
# the m4_case and m4_ifvaln functions make this macro only include the
# portions necessary to perform the specific comparison specified by the
# OP argument in the final configure script.
#
# LICENSE
#
# Copyright (c) 2008 Tim Toolan <toolan@ele.uri.edu>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 13
dnl #########################################################################
dnl AX_COMPARE_VERSION: compare two version strings at configure time.
dnl
dnl Arguments: VERSION_A, OP, VERSION_B, then the optional ACTION-IF-TRUE and
dnl ACTION-IF-FALSE. OP is a literal evaluated by m4: lt, gt, le, ge, or
dnl eq/ne with an optional numeric suffix.
dnl
dnl Result: the shell variable ax_compare_version is set to true or false and
dnl the matching action is run. Apart from that variable and the
dnl ax_compare_version_* temporaries the macro assigns nothing; unrelated
dnl configure settings change only through the actions the caller passes in.
AC_DEFUN([AX_COMPARE_VERSION], [
  AC_REQUIRE([AC_PROG_AWK])

  # Used to indicate true or false condition
  ax_compare_version=false

  # Convert the two version strings to be compared into a format that
  # allows a simple string comparison. The end result is that a version
  # string of the form 1.12.5-r617 will be converted to the form
  # 0001001200050617. In other words, each number is zero padded to four
  # digits, and non digits are removed.
  AS_VAR_PUSHDEF([A],[ax_compare_version_A])
  A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
                     -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/[[^0-9]]//g'`

  AS_VAR_PUSHDEF([B],[ax_compare_version_B])
  B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
                     -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
                     -e 's/[[^0-9]]//g'`

  dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary
  dnl # then the first line is used to determine if the condition is true.
  dnl # The sed right after the echo is to remove any indented white space.
  m4_case(m4_tolower($2),
  [lt],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"`
  ],
  [gt],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"`
  ],
  [le],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"`
  ],
  [ge],[
    ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"`
  ],[
    dnl Split the operator from the subversion count if present.
    m4_bmatch(m4_substr($2,2),
    [0],[
      # A count of zero means use the length of the shorter version.
      # Determine the number of characters in A and B.
      ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'`
      ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'`

      # Set A to no more than B's length and B to no more than A's length.
      A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"`
      B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"`
    ],
    [[0-9]+],[
      # A count greater than zero means use only that many subversions
      A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
      B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
    ],
    [.+],[
      AC_WARNING(
        [invalid OP numeric parameter: $2])
    ],[])

    # Pad zeros at end of numbers to make same length.
    ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`"
    B="$B`echo $A | sed 's/./0/g'`"
    A="$ax_compare_version_tmp_A"

    # Check for equality or inequality as necessary.
    m4_case(m4_tolower(m4_substr($2,0,2)),
    [eq],[
      test "x$A" = "x$B" && ax_compare_version=true
    ],
    [ne],[
      test "x$A" != "x$B" && ax_compare_version=true
    ],[
      AC_WARNING([invalid OP parameter: $2])
    ])
  ])

  AS_VAR_POPDEF([A])dnl
  AS_VAR_POPDEF([B])dnl

  dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE.
  dnl Only the caller-supplied actions run in either branch; a successful
  dnl comparison must not switch on unrelated configure features.
  if test "$ax_compare_version" = "true" ; then
    m4_ifvaln([$4],[$4],[:])dnl
    m4_ifvaln([$5],[else $5])dnl
  fi
]) dnl AX_COMPARE_VERSION
......@@ -54,8 +54,8 @@
#include "../general/sanity.F90"
subroutine merge_systems_&
&PRECISION &
subroutine merge_systems_&
&PRECISION &
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads)
use cuda_functions
......
......@@ -338,12 +338,12 @@ subroutine solve_tridi_&
end subroutine merge_recursive_&
&PRECISION
end subroutine solve_tridi_&
&PRECISION_AND_SUFFIX
end subroutine solve_tridi_&
&PRECISION_AND_SUFFIX
subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )
subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
......@@ -554,12 +554,12 @@ subroutine solve_tridi_&
call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)
end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX
end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX
subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
(obj, nlen, d, e, q, ldq, wantDebug, success)
subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
(obj, nlen, d, e, q, ldq, wantDebug, success)
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
......@@ -673,6 +673,6 @@ subroutine solve_tridi_&
enddo
call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)
end subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX
end subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX
......@@ -56,9 +56,9 @@
#if REALCASE == 1
subroutine v_add_s_&
&PRECISION&
&(obj, v,n,s)
subroutine v_add_s_&
&PRECISION&
&(obj, v,n,s)
use precision
use elpa_abstract_impl
implicit none
......@@ -68,12 +68,12 @@
real(kind=rk) :: v(n),s
v(:) = v(:) + s
end subroutine v_add_s_&
&PRECISION
end subroutine v_add_s_&
&PRECISION
subroutine distribute_global_column_&
&PRECISION&
&(obj, g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
subroutine distribute_global_column_&
&PRECISION&
&(obj, g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
use precision
use elpa_abstract_impl
implicit none
......@@ -101,60 +101,60 @@
l_col(l_off+js:l_off+je) = g_col(g_off+js-noff:g_off+je-noff)
enddo
end subroutine distribute_global_column_&
&PRECISION
subroutine solve_secular_equation_&
&PRECISION&
&(obj, n, i, d, z, delta, rho, dlam)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
!
! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0
!
! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique
! which is more robust (it always yields a solution) but also slower
! than the algorithm used in DLAED4.
!
! The same restrictions as in DLAED4 hold, namely:
!
! rho > 0 and d(i+1) > d(i)
!
! but this routine will not terminate with error if these are not satisfied
! (it will normally converge to a pole in this case).
!
! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases
! N=1 and N=2 which is not compatible with DLAED4.
! Thus this routine shouldn't be used for these cases as a simple replacement
! of DLAED4.
!
! The arguments are the same as in DLAED4 (with the exception of the INFO argument):
!
!
! N (input) INTEGER
! The length of all arrays.
!
! I (input) INTEGER
! The index of the eigenvalue to be computed. 1 <= I <= N.
!
! D (input) DOUBLE PRECISION array, dimension (N)
! The original eigenvalues. It is assumed that they are in
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating Vector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
! See remark above about DLAED4 compatibility!
!
! RHO (input) DOUBLE PRECISION
! The scalar in the symmetric updating formula.
!
! DLAM (output) DOUBLE PRECISION
! The computed lambda_I, the I-th updated eigenvalue.
!-------------------------------------------------------------------------------
end subroutine distribute_global_column_&
&PRECISION
subroutine solve_secular_equation_&
&PRECISION&
&(obj, n, i, d, z, delta, rho, dlam)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
!
! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0
!
! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique
! which is more robust (it always yields a solution) but also slower
! than the algorithm used in DLAED4.
!
! The same restrictions as in DLAED4 hold, namely:
!
! rho > 0 and d(i+1) > d(i)
!
! but this routine will not terminate with error if these are not satisfied
! (it will normally converge to a pole in this case).
!
! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases
! N=1 and N=2 which is not compatible with DLAED4.
! Thus this routine shouldn't be used for these cases as a simple replacement
! of DLAED4.
!
! The arguments are the same as in DLAED4 (with the exception of the INFO argument):
!
!
! N (input) INTEGER
! The length of all arrays.
!
! I (input) INTEGER
! The index of the eigenvalue to be computed. 1 <= I <= N.
!
! D (input) DOUBLE PRECISION array, dimension (N)
! The original eigenvalues. It is assumed that they are in
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating Vector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
! See remark above about DLAED4 compatibility!
!
! RHO (input) DOUBLE PRECISION
! The scalar in the symmetric updating formula.
!
! DLAM (output) DOUBLE PRECISION
! The computed lambda_I, the I-th updated eigenvalue.
!-------------------------------------------------------------------------------
use precision
use elpa_abstract_impl
......@@ -238,19 +238,19 @@
delta(:) = delta(:) - x
call obj%timer%stop("solve_secular_equation" // PRECISION_SUFFIX)
end subroutine solve_secular_equation_&
&PRECISION
!-------------------------------------------------------------------------------
end subroutine solve_secular_equation_&
&PRECISION
!-------------------------------------------------------------------------------
#endif
#if REALCASE == 1
subroutine hh_transform_real_&
subroutine hh_transform_real_&
#endif
#if COMPLEXCASE == 1
subroutine hh_transform_complex_&
subroutine hh_transform_complex_&
#endif
&PRECISION &
(obj, alpha, xnorm_sq, xf, tau, wantDebug)
&PRECISION &
(obj, alpha, xnorm_sq, xf, tau, wantDebug)
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
#endif
......@@ -353,7 +353,7 @@
&PRECISION_SUFFIX )
#if REALCASE == 1
end subroutine hh_transform_real_&
end subroutine hh_transform_real_&
#endif
#if COMPLEXCASE == 1
end subroutine hh_transform_complex_&
......
......@@ -88,11 +88,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
use precision
......@@ -553,7 +553,7 @@
&PRECISION_SUFFIX // &
gpuString )
end subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION
end subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION
......@@ -882,7 +882,7 @@ subroutine tridiag_&
if (useGPU) then
if(.not. mat_vec_as_one_block) then
if (.not. mat_vec_as_one_block) then
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as for the CPU version
if (wantDebug) call obj%timer%start("cublas")
......@@ -911,7 +911,7 @@ subroutine tridiag_&
enddo
if (useGPU) then
if(mat_vec_as_one_block) then
if (mat_vec_as_one_block) then
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
if (wantDebug) call obj%timer%start("cublas")
......@@ -970,7 +970,7 @@ subroutine tridiag_&
if (my_pcol==pcol(2, nblk, np_cols)) then
if (my_prow==prow(1, nblk, np_rows)) then
! We use last l_cols value of loop above
if(useGPU) then
if (useGPU) then
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
......@@ -987,8 +987,6 @@ subroutine tridiag_&
#if COMPLEXCASE == 1
e_vec(1) = real(vrl,kind=rk)
#endif
a_mat(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
......@@ -1008,7 +1006,7 @@ subroutine tridiag_&
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
if (useGPU) then
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
......@@ -1024,7 +1022,7 @@ subroutine tridiag_&
! Store e_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
if (useGPU) then
successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
......@@ -1145,7 +1143,7 @@ subroutine tridiag_&
PRECISION_SUFFIX // &
gpuString )
end subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION
end subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment