Commit c3215b87 authored by Andreas Marek

Merge branch 'matrix_redistribute' into 'master_pre_stage'

Matrix redistribute

See merge request elpa/elpa!31
parents 52b3ce2a 2a9f9fa8
This source diff could not be displayed because it is too large.
......@@ -2,6 +2,11 @@ Changelog for upcoming release
- not yet decided
Changelog for ELPA 2020.05.001
- improved documentation, including fixing of typos and errors in markdown
- Fix a bug in the calling of Cannon's algorithm which might lead to crashes
for a square process grid
Changelog for ELPA 2019.11.001
- fix a bug when using parallel make builds
......
......@@ -59,7 +59,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/general/mod_elpa_skewsymmetric_blas.F90 \
src/elpa_index.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
libelpa@SUFFIX@_private_la_SOURCES += \
......@@ -74,6 +74,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1/elpa1_compute_template.F90 \
src/elpa2/elpa2_compute_real_template.F90 \
src/elpa2/elpa2_compute_complex_template.F90 \
src/helpers/elpa_redistribute_template.F90 \
src/elpa1/elpa1_template.F90 \
src/elpa2/elpa2_template.F90 \
src/elpa2/qr/qr_utils_template.F90 \
......@@ -95,6 +96,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/compute_hh_trafo.F90 \
src/elpa2/redist_band.F90 \
src/general/sanity.F90 \
src/general/error_checking.inc \
src/elpa1/elpa_cholesky_template.F90 \
src/elpa1/elpa_invert_trm.F90 \
src/elpa1/elpa_multiply_a_b.F90 \
......@@ -122,8 +124,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES += src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
endif
if !WITH_MPI
......@@ -214,16 +215,16 @@ endif
endif
if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_2hv_single_precision.c
endif
else
endif
if WITH_REAL_AVX2_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c
endif
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_2hv_single_precision.c
endif
endif
......@@ -263,16 +264,16 @@ endif
endif
if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_4hv_single_precision.c
endif
else
endif
if WITH_REAL_AVX2_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c
endif
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_4hv_single_precision.c
endif
endif
......@@ -312,16 +313,16 @@ endif
endif
if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx_6hv_single_precision.c
endif
else
endif
if WITH_REAL_AVX2_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c
endif
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx2_6hv_single_precision.c
endif
endif
......@@ -354,16 +355,16 @@ endif
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_1hv_single_precision.c
endif
else
endif
if WITH_COMPLEX_AVX2_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx2_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c
endif
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx2_1hv_single_precision.c
endif
endif
......@@ -397,16 +398,16 @@ endif
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_2hv_single_precision.c
endif
else
endif
if WITH_COMPLEX_AVX2_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx2_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c
endif
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx2_2hv_single_precision.c
endif
endif
......@@ -673,6 +674,7 @@ EXTRA_DIST = \
src/GPU/cuUtils_template.cu \
src/elpa_api_math_template.F90 \
src/elpa_impl_math_template.F90 \
src/helpers/elpa_redistribute_template.F90 \
src/elpa_impl_generalized_transform_template.F90 \
src/elpa1/elpa1_compute_template.F90 \
src/elpa1/elpa1_merge_systems_real_template.F90 \
......@@ -687,8 +689,6 @@ EXTRA_DIST = \
src/elpa1/elpa_reduce_add_vectors.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/elpa1/elpa_transpose_vectors.F90 \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \
src/elpa2/compute_hh_trafo.F90 \
src/elpa2/elpa2_bandred_template.F90 \
src/elpa2/elpa2_compute_complex_template.F90 \
......@@ -724,6 +724,7 @@ EXTRA_DIST = \
test/shared/test_precision_kinds.F90 \
src/general/prow_pcol.F90 \
src/general/sanity.F90 \
src/general/error_checking.inc \
src/general/elpa_ssr2_template.F90 \
src/general/elpa_ssmv_template.F90 \
test/Fortran/assert.h \
......
......@@ -416,6 +416,48 @@ ilp64_no_omp_mpi_tests = [
print("\n".join(ilp64_no_omp_mpi_tests))
# two tests for matrix-redistribute
matrix_redistribute_mpi_tests = [
"# gnu-gnu-matrix-redistribute-mpi-noomp",
"gnu-gnu-mpi-noopenmp-matrix-redistribute:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"mpicc\\" CFLAGS=\\"-O3 -mavx\\" '
'FC=\\"mpif90\\" FCFLAGS=\\"-O3 -mavx\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
"# gnu-gnu-matrix-redistribute-mpi-openmp",
"gnu-gnu-mpi-openmp-matrix-redistribute:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"mpicc\\" CFLAGS=\\"-O3 -mavx\\" '
'FC=\\"mpif90\\" FCFLAGS=\\"-O3 -mavx\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
]
print("\n".join(matrix_redistribute_mpi_tests))
# add python tests
python_ci_tests = [
"# python tests",
......@@ -661,8 +703,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
# add tests for scalapack for some specific test cases
runScalapackTest = False
if (instr == "avx2" and cov == "coverage" and m == "mpi"):
runScalapackTest = True
#if (instr == "avx2" and cov == "coverage" and m == "mpi"):
#if (instr == "avx2" and m == "mpi"):
# runScalapackTest = True
# address-sanitize only with gnu compiler
......@@ -734,9 +777,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
if (MasterOnly):
print(" only:")
print(" - /.*master.*/")
#if (MasterOnly):
# print(" only:")
# print(" - /.*master.*/")
if (instr == "power8"):
print(" allow_failure: true")
print(" tags:")
......@@ -790,7 +833,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
if (runScalapackTest):
print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
+ libs + " " + ldflags + " " + " "+ scalapackldflags +" " + scalapackfcflags \
+ " --enable-option-checking=fatal --enable-scalapack-tests" + " " + mpi_configure_flag + " " + openmp[o] \
+ " --enable-option-checking=fatal --enable-scalapack-tests --enable-autotune-redistribute-matrix" + " " + mpi_configure_flag + " " + openmp[o] \
+ " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
+ " " +gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -q \"srun\" -S $SLURM -g " +gpuJob)
......
......@@ -104,30 +104,6 @@ if test x"${with_mpi}" = x"yes"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPCK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
dnl C
AC_LANG_PUSH([C])
......@@ -161,11 +137,24 @@ if test x"$c11_standard" = x"no"; then
AX_CHECK_COMPILE_FLAG([-std=c11], [
c11_standard=yes
], [
echo "C compiler cannot compile -std=c11 code"
echo "testing -c11.."
])
if test x"$c11_standard" = x"yes"; then
CFLAGS+=" -std=c11"
fi
fi
if test x"$c11_standard" = x"no"; then
AX_CHECK_COMPILE_FLAG([-c11], [
c11_standard=yes
], [
echo "C compiler cannot compile -c11 code"
echo "C compiler cannot compile C11 code"
exit -1
])
if test x"$c11_standard" = x"yes"; then
CFLAGS+=" -std=c11"
CFLAGS+=" -c11"
fi
fi
......@@ -759,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
#m4_define(elpa_m4_gpu_kernels, [
# real_gpu
# complex_gpu
#])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -805,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
#ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -847,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
])
#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
# [Compile and always use the GPU version])],
# [],[with_gpu_support_only=no])
#if test x"$with_gpu_support_only" = x"yes" ; then
# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
# use_[]elpa_m4_kernel[]=no
# ])
# use_real_gpu=yes
# use_complex_gpu=yes
#fi
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
[Compile and always use the GPU version])],
[],[with_gpu_support_only=no])
if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
fi
dnl
......@@ -958,6 +947,43 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q);
dnl return 0;
dnl }
AC_LANG_PUSH([C])
dnl check whether one can link against Fortran programs from C
AC_MSG_CHECKING(whether we need _ in C programs to link against a Fortran library)
AC_LINK_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
int m, n, k, lda, ldb, ldc;
double alpha, beta;
double *a, *b, *c;
dgemm_("N", "N", &m, &n, &k, &alpha, a, lda, b, &ldb, &beta, c, &ldc);
}
])],
[can_link_with_=yes],
[can_link_with_=no]
)
AC_MSG_RESULT([${can_link_with_}])
if test x"$can_link_with_" = x"yes"; then
AC_DEFINE([NEED_UNDERSCORE_TO_LINK_AGAINST_FORTRAN],[1],[need to append an underscore])
fi
AC_MSG_CHECKING(whether we do not need _ in C programs to link against a Fortran library)
AC_LINK_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
int m, n, k, lda, ldb, ldc;
double alpha, beta;
double *a, *b, *c;
dgemm("N", "N", &m, &n, &k, &alpha, a, lda, b, &ldb, &beta, c, &ldc);
}
])],
[can_link_without_=yes],
[can_link_without_=no]
)
AC_MSG_RESULT([${can_link_without_}])
if test x"$can_link_with_" = x"yes"; then
AC_DEFINE([NEED_NO_UNDERSCORE_TO_LINK_AGAINST_FORTRAN],[1],[need not to append an underscore])
fi
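As a minimal C sketch of how the two macros defined by these link checks are typically consumed: the helper macro FORTRAN_SYMBOL below is hypothetical and not part of this diff, it only illustrates selecting the detected name-mangling convention.

    /* Hypothetical helper: pick the Fortran name-mangling convention
       detected by the two configure checks above. */
    #ifdef NEED_UNDERSCORE_TO_LINK_AGAINST_FORTRAN
    #  define FORTRAN_SYMBOL(name) name##_
    #else
    #  define FORTRAN_SYMBOL(name) name
    #endif

    /* Declare the BLAS routine used in the link test under the detected
       symbol name; all arguments are passed by reference, as in Fortran. */
    void FORTRAN_SYMBOL(dgemm)(const char *transa, const char *transb,
                               const int *m, const int *n, const int *k,
                               const double *alpha, const double *a, const int *lda,
                               const double *b, const int *ldb, const double *beta,
                               double *c, const int *ldc);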
if test x"${need_vsx}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C)
......@@ -1318,7 +1344,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
#AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be built])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
......@@ -1411,6 +1437,100 @@ if test x"${enable_autotuning}" = x"yes"; then
AC_DEFINE([ENABLE_AUTOTUNING], [1], [enable autotuning functionality])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPACK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
AC_MSG_CHECKING(whether matrix redistribution should be considered in autotuning)
AC_ARG_ENABLE([autotune-redistribute-matrix],
AS_HELP_STRING([--enable-autotune-redistribute-matrix],
[Allows ELPA during autotuning to re-distribute the matrix to find the best (ELPA internal) block size for block-cyclic distribution (Needs Scalapack functionality)]),
[if test x"$enableval" = x"yes"; then
enable_autotune_redistribute_matrix=yes
else
enable_autotune_redistribute_matrix=no
fi],
[enable_autotune_redistribute_matrix=no])
AC_MSG_RESULT([${enable_autotune_redistribute_matrix}])
if test x"${enable_autotune_redistribute_matrix}" = x"yes" ; then
if test x"${enable_scalapack_tests}" = x"no"; then
AC_MSG_ERROR([Please also set --enable-scalapack-tests in this case])
fi
if test x"${with_mpi}" = x"no"; then
AC_MSG_ERROR([For this option ELPA must be built with MPI enabled])
fi
AC_DEFINE([REDISTRIBUTE_MATRIX],[1],[enable matrix re-distribution during autotuning])
fi
AC_MSG_CHECKING(whether C tests should be provided)
AC_ARG_ENABLE([c-tests],
AS_HELP_STRING([--enable-c-tests],
......
......@@ -9,6 +9,16 @@
name = value,
#define ELPA_ENUM_SUM(name, value, ...) +1
/* MATRIX layout */
#define ELPA_FOR_ALL_MATRIX_LAYOUTS(X) \
X(COLUMN_MAJOR_ORDER, 1) \
X(ROW_MAJOR_ORDER, 2)
enum MATRIX_LAYOUTS {
ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY)
};
#define ELPA_NUMBER_OF_MATRIX_LAYOUTS (0 ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM))
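For clarity, this is what the X-macro block above expands to, hand-expanded with the ELPA_ENUM_ENTRY and ELPA_ENUM_SUM helpers shown earlier in this header; it is a sketch of the preprocessor result, not additional source code.

    /* ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY) yields the enumerators: */
    enum MATRIX_LAYOUTS {
      COLUMN_MAJOR_ORDER = 1,
      ROW_MAJOR_ORDER = 2,
    };
    /* ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM) yields "+1 +1", so
       ELPA_NUMBER_OF_MATRIX_LAYOUTS evaluates to (0 +1 +1) == 2. */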
/* Solver constants */
#define ELPA_FOR_ALL_SOLVERS(X) \
......
(Two further file diffs are collapsed and not shown here.)
......@@ -51,9 +51,10 @@ module cuda_functions
integer(kind=ik) :: cudaMemcpyHostToDevice
integer(kind=ik) :: cudaMemcpyDeviceToHost
integer(kind=ik) :: cudaMemcpyDeviceToDevice
integer(kind=ik) :: cudaHostRegisterDefault
integer(kind=ik) :: cudaHostRegisterPortable
integer(kind=ik) :: cudaHostRegisterMapped
integer(kind=ik) :: cudaMemcpyDeviceToDevice
! TODO global variable, has to be changed
integer(kind=C_intptr_T) :: cublasHandle = -1
......@@ -77,7 +78,7 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_create_c
end interface
end interface
interface
function cublas_destroy_c(handle) result(istat) &
......@@ -87,15 +88,6 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_destroy_c
end interface
interface
function cuda_threadsynchronize_c() result(istat) &
bind(C,name="cudaThreadSynchronizeFromC")
use iso_c_binding
implicit none
integer(kind=C_INT) :: istat
end function cuda_threadsynchronize_c
end interface
interface
......@@ -160,6 +152,15 @@ module cuda_functions
end function
end interface
interface
function cuda_hostRegisterDefault_c() result(flag) &
bind(C, name="cudaHostRegisterDefaultFromC")
use iso_c_binding
implicit none
integer(kind=c_int) :: flag
end function
end interface
interface
function cuda_hostRegisterPortable_c() result(flag) &
bind(C, name="cudaHostRegisterPortableFromC")
......@@ -214,6 +215,34 @@ module cuda_functions
end function cuda_memcpy2d_c
end interface
interface
function cuda_host_register_c(a, size, flag) result(istat) &
bind(C, name="cudaHostRegisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=c_intptr_t), intent(in), value :: size
integer(kind=C_INT), intent(in), value :: flag
integer(kind=C_INT) :: istat
end function cuda_host_register_c
end interface
interface
function cuda_host_unregister_c(a) result(istat) &
bind(C, name="cudaHostUnregisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=C_INT) :: istat
end function cuda_host_unregister_c
end interface
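The C side of these two bindings lives in src/GPU/cudaFunctions.cu, whose diff is not displayed on this page. As a hedged sketch only, assuming the ELPA convention (seen elsewhere in this module) that the C wrappers return non-zero on success, the matching wrappers could look like this:

    /* Sketch, not the actual ELPA implementation: C wrappers matching the
       Fortran bind(C) interfaces above. */
    #include <stdint.h>
    #include <cuda_runtime.h>

    int cudaHostRegisterFromC(intptr_t a, intptr_t size, int flag) {
      /* "a" arrives by value as an address, as in the Fortran interface */
      cudaError_t err = cudaHostRegister((void *) a, (size_t) size, (unsigned int) flag);
      return err == cudaSuccess;
    }

    int cudaHostUnregisterFromC(intptr_t a) {
      cudaError_t err = cudaHostUnregister((void *) a);
      return err == cudaSuccess;
    }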
! functions to allocate and free CUDA memory
interface
......@@ -243,6 +272,33 @@ module cuda_functions
end function cuda_malloc_c
end interface
interface
function cuda_free_host_c(a) result(istat) &
bind(C, name="cudaFreeHostFromC")
use iso_c_binding
implicit none
type(c_ptr), value :: a
integer(kind=C_INT) :: istat
end function cuda_free_host_c
end interface
interface
function cuda_malloc_host_c(a, width_height) result(istat) &
bind(C, name="cudaMallocHostFromC")
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in), value :: width_height
integer(kind=C_INT) :: istat
end function cuda_malloc_host_c
end interface
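Again a hedged sketch only (the real implementations sit in the undisplayed src/GPU/cudaFunctions.cu): note that type(c_ptr) without the VALUE attribute maps to void** on the C side, which is exactly what cudaMallocHost expects.

    /* Sketch, not the actual ELPA implementation. */
    #include <stdint.h>
    #include <cuda_runtime.h>

    int cudaMallocHostFromC(void **a, intptr_t width_height) {
      /* type(c_ptr) passed by reference -> void** */
      cudaError_t err = cudaMallocHost(a, (size_t) width_height);
      return err == cudaSuccess;
    }

    int cudaFreeHostFromC(void *a) {
      /* type(c_ptr), value -> void* */
      cudaError_t err = cudaFreeHost(a);
      return err == cudaSuccess;
    }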
interface
function cuda_memset_c(a, val, size) result(istat) &
bind(C, name="cudaMemsetFromC")
......@@ -290,7 +346,7 @@ module cuda_functions
real(kind=C_FLOAT),value :: alpha,beta
integer(kind=C_intptr_T), value :: a, b, c
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_sgemm_c
end interface
......@@ -307,7 +363,7 @@ module cuda_functions
real(kind=C_DOUBLE), value :: alpha
integer(kind=C_intptr_T), value :: a, b
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_dtrmm_c
end interface
......@@ -527,19 +583,6 @@ module cuda_functions
success = .true.
#endif
end function
function cuda_threadsynchronize() result(success)
use iso_c_binding
implicit none
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_threadsynchronize_c() /= 0
#else
success = .true.
#endif
end function cuda_threadsynchronize
function cuda_setdevice(n) result(success)
use iso_c_binding
......@@ -614,6 +657,35 @@ module cuda_functions
#endif
end function cuda_free
function cuda_malloc_host(a, width_height) result(success)
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
logical :: success