Commit 85449317 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master_pre_stage' into 'master'

New keywords for GPU in ELPA index

See merge request !63
parents 197666fe ab8bfde4
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -4,6 +4,9 @@ Changelog for next release
Upcoming changes for ELPA 2021.05.001
- allow the user to set the mapping of MPI tasks to GPU id per set/get
- experimental feature: port to AMD GPUS, works correctly, performance yet
unclear; only tested --with-mpi=0
- On request, ELPA can print the pinning of MPI tasks and OpenMP thread
Changelog for ELPA 2020.11.001
......
......@@ -45,8 +45,13 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/elpa2_compute.F90 \
src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
src/GPU/check_for_gpu.F90 \
src/GPU/mod_cuda.F90 \
src/elpa2/GPU/interface_c_kernel.F90 \
src/GPU/mod_vendor_agnostic_layer.F90 \
src/GPU/CUDA/mod_cuda.F90 \
src/GPU/INTEL/mod_mkl_offload.F90 \
src/GPU/ROCm/mod_hip.F90 \
src/elpa2/GPU/interface_c_gpu_kernel.F90 \
src/elpa2/GPU/CUDA/interface_c_cuda_kernel.F90 \
src/elpa2/GPU/ROCm/interface_c_hip_kernel.F90 \
src/elpa2/mod_pack_unpack_gpu.F90 \
src/elpa2/qr/qr_utils.F90 \
src/elpa2/qr/elpa_qrkernels.F90 \
......@@ -137,8 +142,16 @@ libelpa@SUFFIX@_private_la_SOURCES += \
src/helpers/timer_dummy.F90
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/elpa_index_gpu.cu src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
if WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu
endif
if WITH_INTEL_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/INTEL/mkl_offload.cpp
endif
if WITH_AMD_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_real.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_complex.cpp
endif
if !WITH_MPI
......@@ -555,6 +568,7 @@ if STORE_BUILD_CONFIG
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c
endif
# rule for ROCm files
# Cuda files
.cu.lo:
......@@ -613,7 +627,9 @@ dist_man_MANS = \
man/elpa_autotune_deallocate.3 \
man/elpa_uninit.3
dist_doc_DATA = README.md USERS_GUIDE.md USERS_GUIDE_DEPRECATED_LEGACY_API.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt
dist_doc_DATA = README.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt documentation/INSTALL.md \
documentation/PERFORMANCE_TUNING.md documentation/SWITCHING_TO_NEW_INTERFACE.md documentation/USERS_GUIDE.md \
documentation/USERS_GUIDE_DEPRECATED_LEGACY_API.md documentation/plots/mpi_elpa1.png documentation/plots/mpi_elpa2.png
# pkg-config stuff
pkgconfigdir = $(libdir)/pkgconfig
......@@ -775,11 +791,13 @@ CLEANFILES = \
clean-local:
-rm -rf modules/* private_modules/* test_modules/* .fortran_dependencies/*
-rm -rf validate_*.sh
-rm -rf test_*.sh
-rm -rf real_2stage*.sh
-rm -rf complex_2stage*.sh
-rm -rf single_complex_2stage*.sh
-rm -rf single_real_2stage*.sh
-rm -rf double_instance_onenode*.sh
-rm -rf test_*.sh
-rm -rf $(generated_headers)
distclean-local:
......@@ -806,7 +824,8 @@ EXTRA_DIST = \
remove_xcompiler \
src/helpers/fortran_blas_interfaces.F90 \
src/helpers/fortran_scalapack_interfaces.F90 \
src/GPU/cuUtils_template.cu \
src/GPU/CUDA/cuUtils_template.cu \
src/GPU/ROCm/hipUtils_template.cpp \
src/elpa_api_math_template.F90 \
src/elpa_impl_math_template.F90 \
src/helpers/elpa_redistribute_template.F90 \
......
......@@ -125,12 +125,12 @@ configure *ELPA*):
- Scalapack routines
- a working MPI library
Please refer to the [INSTALL document](INSTALL.md) on details of the installation process and
Please refer to the [INSTALL document](./documentation/INSTALL.md) on details of the installation process and
the possible configure options.
## Using *ELPA*
Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html) doxygen documentation, where you find the definition of the interfaces.
Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html) doxygen documentation, where you find the definition of the interfaces. You might want to have a look at the [PERFORMANCE tuning document](./documentation/PERFORMANCE_TUNING.md) to avoid some usual pitfalls.
## Contributing to *ELPA*
......
......@@ -346,7 +346,7 @@ ilp64_no_omp_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -364,7 +364,7 @@ ilp64_no_omp_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -388,7 +388,7 @@ ilp64_no_omp_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -406,7 +406,7 @@ ilp64_no_omp_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -430,7 +430,7 @@ matrix_redistribute_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -448,7 +448,7 @@ matrix_redistribute_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -472,7 +472,7 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -486,7 +486,7 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP\\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP\\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m 150 -n 50 -b 16 '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -496,13 +496,13 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests "'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests "'
'-d " CC=\\\\\\\"mpiicc\\\\\\\" CFLAGS=\\\\\\\"-O3 -xAVX\\\\\\\" '
'FC=\\\\\\\"mpiifort\\\\\\\" FCFLAGS=\\\\\\\"-O3 -xAVX\\\\\\\" '
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests'
'" -t $MPI_TASKS -m 150 -n 50 -b 16 -S $SLURM || { chmod u+rwX -R . ; exit 1 ; }',
"\n",
]
......@@ -597,8 +597,8 @@ band_to_full_blocking = {
}
gpu = {
"no-gpu" : "--disable-gpu",
"with-gpu" : "--enable-gpu --with-cuda-path=\\$CUDA_HOME/",
"no-gpu" : "--disable-nvidia-gpu",
"with-gpu" : "--enable-nvidia-gpu --with-cuda-path=\\$CUDA_HOME/",
}
......
......@@ -110,6 +110,11 @@ if test x"${with_mpi}" = x"yes"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl CPP
AC_LANG_PUSH([C++])
AC_PROG_CXX
AC_LANG_POP([C++])
dnl C
AC_LANG_PUSH([C])
......@@ -718,23 +723,23 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
dnl setup nvcc flags and use them in later tests
user_sets_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=VALUE],
[use compute capability VALUE for GPU version, default: "sm_35"])],
[user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
user_sets_nvidia_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a Nvidia GPU compute capability is specified)
AC_ARG_WITH([NVIDIA-GPU-compute-capability],
[AS_HELP_STRING([--with-NVIDIA-GPU-compute-capability=VALUE],
[use compute capability VALUE for NVIDIA GPU version, default: "sm_35"])],
[user_sets_nvidia_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_nvidia_gpu_compute_capability}])
dnl sanity check whether compute capability setting by user is reasonable
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
if test x"${user_sets_nvidia_gpu_compute_capability}" = x"yes" ; then
dnl the user must set a value which starts with "sm_"
value=$(echo $withval | cut -c1-3)
if test x"${value}" = x"sm_" ; then
cuda_compute_capability=$withval
else
AC_MSG_ERROR([Unknown GPU compute capability set: ${withval}])
AC_MSG_ERROR([Unknown Nvidia GPU compute capability set: ${withval}])
fi
fi
......@@ -877,12 +882,22 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
m4_define(elpa_m4_nvidia_gpu_kernels, [
real_nvidia_gpu
complex_nvidia_gpu
])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_intel_gpu_kernels, [
real_intel_gpu
complex_intel_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -926,7 +941,9 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([intel_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -968,15 +985,37 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
])
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
[Compile and always use the GPU version])],
[],[with_gpu_support_only=no])
if test x"$with_gpu_support_only" = x"yes" ; then
AC_ARG_WITH(nvidia-gpu-support-only, [AS_HELP_STRING([--with-nvidia-gpu-support-only],
[Compile and always use the NVIDIA GPU version])],
[],[with_nvidia_gpu_support_only=no])
if test x"$with_nvidia_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(amd-gpu-support-only, [AS_HELP_STRING([--with-amd-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
if test x"$with_amd_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_amd_gpu=yes
use_complex_amd_gpu=yes
fi
AC_ARG_WITH(intel-gpu-support-only, [AS_HELP_STRING([--with-intel-gpu-support-only],
[Compile and always use the INTEL GPU version])],
[],[with_intel_gpu_support_only=no])
if test x"$with_intel_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
use_real_intel_gpu=yes
use_complex_intel_gpu=yes
fi
......@@ -1053,7 +1092,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1462,11 +1501,11 @@ if test x"${need_bgq}" = x"yes"; then
fi
AC_LANG_POP([Fortran])
#compatibiility flag
AC_MSG_CHECKING(whether GPU version should be used)
AC_ARG_ENABLE([gpu],
AS_HELP_STRING([--enable-gpu],
[do use GPU version]),
[do use Nvidia GPU version (compatibility flag, better set explicitely)]),
[if test x"$enableval" = x"yes"; then
use_gpu=yes
else
......@@ -1475,12 +1514,64 @@ AC_ARG_ENABLE([gpu],
[use_gpu=no])
AC_MSG_RESULT([${use_gpu}])
if test x"${use_gpu}" = x"yes" ; then
need_gpu=yes
use_real_gpu=yes
use_complex_gpu=yes
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
fi
AC_MSG_CHECKING(whether Nvidia-GPU version should be used)
AC_ARG_ENABLE([Nvidia-gpu],
AS_HELP_STRING([--enable-Nvidia-gpu],
[do use Nvidia GPU version]),
[if test x"$enableval" = x"yes"; then
use_nvidia_gpu=yes
else
use_nvidia_gpu=no
fi],
[use_nvidia_gpu=no])
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
fi
#AC_MSG_CHECKING(whether INTEL GPU version should be used)
#AC_ARG_ENABLE([intel-gpu],
# AS_HELP_STRING([--enable-intel-gpu],
# [do use INTEL GPU version]),
# [if test x"$enableval" = x"yes"; then
# use_intel_gpu=yes
# else
# use_intel_gpu=no
# fi],
# [use_intel_gpu=no])
#AC_MSG_RESULT([${use_intel_gpu}])
#if test x"${use_intel_gpu}" = x"yes" ; then
# need_intel_gpu=yes
# use_real_intel_gpu=yes
# use_complex_intel_gpu=yes
#fi
AC_MSG_CHECKING(whether AMD-GPU version should be used)
AC_ARG_ENABLE([AMD-gpu],
AS_HELP_STRING([--enable-AMD-gpu],
[do use AMD GPU version]),
[if test x"$enableval" = x"yes"; then
use_amd_gpu=yes
else
use_amd_gpu=no
fi],
[use_amd_gpu=no])
AC_MSG_RESULT([${use_amd_gpu}])
if test x"${use_amd_gpu}" = x"yes" ; then
need_amd_gpu=yes
use_real_amd_gpu=yes
use_complex_amd_gpu=yes
fi
if test x"${need_gpu}" = x"yes" ; then
if test x"${need_nvidia_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
......@@ -1492,33 +1583,88 @@ if test x"${need_gpu}" = x"yes" ; then
dnl check whether nvcc compiler is found
AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
if test x"${nvcc_found}" = x"no" ; then
AC_MSG_ERROR([nvcc not found; try to set the cuda-path or disable GPU support])
AC_MSG_ERROR([nvcc not found; try to set the cuda-path or disable Nvidia GPU support])
fi
dnl check whether we find cublas
AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
if test x"${have_cublas}" = x"no"; then
AC_MSG_ERROR([Could not link cublas; try to set the cuda-path or disable GPU support])
AC_MSG_ERROR([Could not link cublas; try to set the cuda-path or disable Nvidia GPU support])
fi
AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
if test x"${have_cudart}" = x"no"; then
AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable GPU support])
AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable Nvidia GPU support])
fi
AC_LANG_POP([C])
fi
AC_MSG_CHECKING(whether GPU memory debugging should be enabled)
AC_ARG_ENABLE([gpu-memory-debug],
AS_HELP_STRING([--enable-gpu-memory-debug],
[Output information on GPU memory to be processed by utils/memory/check_memory.py]),
if test x"${need_amd_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
#CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
#LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
#NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
#NVCC="nvcc"
#AC_SUBST(NVCC)
#AC_SUBST(NVCCFLAGS)
dnl check whether hipcc compiler is found
AC_CHECK_PROG(hipcc_found,hipcc,yes,no)
if test x"${hipcc_found}" = x"no" ; then
AC_MSG_ERROR([hipcc not found; try to set the hip-path or disable AMD GPU support])
fi
#dnl check whether we find rocblas
AC_SEARCH_LIBS([rocblas_dgemm],[rocblas],[have_rocblas=yes],[have_rocblas=no])
if test x"${have_rocblas}" = x"no"; then
AC_MSG_ERROR([Could not link rocblas; try to set the hip-path or disable AMD GPU support])
fi
#AC_SEARCH_LIBS([hipMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#if test x"${have_cudart}" = x"no"; then
# AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable Nvidia GPU support])
#fi
#AC_LANG_POP([C])
fi
if test x"${need_intel_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
#CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
#LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
#NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
#NVCC="nvcc"
#AC_SUBST(NVCC)
#AC_SUBST(NVCCFLAGS)
dnl check whether hipcc compiler is found
AC_CHECK_PROG(icx_found,icx,yes,no)
if test x"${icx_found}" = x"no" ; then
AC_MSG_ERROR([icx not found; try to set the oneapi-path or disable INTEL GPU support])
fi
##dnl check whether we find rocblas
#AC_SEARCH_LIBS([rocblas_dgemm],[rocblas],[have_rocblas=yes],[have_rocblas=no])
#if test x"${have_rocblas}" = x"no"; then
# AC_MSG_ERROR([Could not link rocblas; try to set the hip-path or disable AMD GPU support])
#fi
#AC_SEARCH_LIBS([hipMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#if test x"${have_cudart}" = x"no"; then
# AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable Nvidia GPU support])
#fi
#AC_LANG_POP([C])
fi
AC_MSG_CHECKING(whether Nvidia GPU memory debugging should be enabled)
AC_ARG_ENABLE([nvidia-gpu-memory-debug],
AS_HELP_STRING([--enable-nvidia-gpu-memory-debug],
[Output information on Nvidia GPU memory to be processed by utils/memory/check_memory.py]),
[if test x"$enableval" = x"yes"; then
enable_gpu_memory_debug=yes
enable_nvidia_gpu_memory_debug=yes
else
enable_gpu_memory_debug=no
enable_nvidia_gpu_memory_debug=no
fi],
[enable_gpu_memory_debug=no])
AC_MSG_RESULT([${enable_gpu_memory_debug}])
if test x"${enable_gpu_memory_debug}" = x"yes" ; then
[enable_nvidia_gpu_memory_debug=no])
AC_MSG_RESULT([${enable_nvidia_gpu_memory_debug}])
if test x"${enable_nvidia_gpu_memory_debug}" = x"yes" ; then
AC_DEFINE([DEBUG_CUDA],[1],[enable CUDA debugging])
fi
......@@ -1535,17 +1681,17 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kernel)[_COMPILED])
])
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_nvidia_gpu" = x"yes" -o x"$use_complex_nvidia_gpu" = x"yes"])
if test x"$use_real_nvidia_gpu" = x"yes" -o x"$use_complex_nvidia_gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
AS_HELP_STRING([--enable-nvtx],
[build and install nvtx wrapper for profiling th GPU version, default no.]),
[build and install nvtx wrapper for profiling the Nvidia GPU version, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_nvtx=yes
......@@ -1560,16 +1706,66 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_PUSH([C])
AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no])
if test x"${have_nvtoolsext}" = x"no"; then
AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable GPU support ])
AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable Nvidia GPU support ])
fi
AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
AC_DEFINE([WITH_AMD_GPU_VERSION],[1],[enable AMD GPU support])
AC_DEFINE([WITH_AMD_GPU_KERNEL],[1],[AMD GPU kernel should be build])
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=1
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=1
#AC_MSG_CHECKING(whether --enable-nvtx is specified)
#AC_ARG_ENABLE([nvtx],
# AS_HELP_STRING([--enable-nvtx],
# [build and install nvtx wrapper for profiling the Nvidia GPU version, default no.]),
# [
# if test x"$enableval" = x"yes"; then
# enable_nvtx=yes
# else
# enable_nvtx=no
# fi
# ],
# [enable_nvtx=no])
#AC_MSG_RESULT([${enable_nvtx}])
#if test x"${enable_nvtx}" = x"yes"; then
# AC_DEFINE([WITH_NVTX],[1],[enable NVTX support])
# AC_LANG_PUSH([C])
# AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no])
# if test x"${have_nvtoolsext}" = x"no"; then
# AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable Nvidia GPU support ])
# fi
# AC_LANG_POP([C])
#fi
else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
AC_DEFINE([WITH_INTEL_GPU_VERSION],[1],[enable INTEL GPU support])
AC_DEFINE([WITH_INTEL_GPU_KERNEL],[1],[INTEL GPU kernel should be build])
ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=1
ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=0
ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_INTEL_GPU_COMPILED])
LT_INIT
......
# Installation guide for the *ELPA* library#
# Installation guide for the *ELPA* library #
## Preamble ##
......@@ -384,7 +384,7 @@ Remarks:
FC=mpi_wrapper_for_gnu_Fortran_compiler CC=mpi_wrapper_for_gnu_C_compiler ./configure FCFLAGS="-O3 -march=native -mavx2 -mfma" CFLAGS="-O3 -march=native -mavx2 -mfma -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64"
```
2. Building with Intel Fortran compiler and Intel C compiler:
3. Building with Intel Fortran compiler and Intel C compiler:
Remarks:
- you have to know the name of the Intel Fortran compiler wrapper
......@@ -392,13 +392,111 @@ Remarks:
- you should specify compiler flags for Intel Fortran compiler; in the example only "-O3 -xAVX2" is set
- you should be careful with the CFLAGS, the example shows typical flags
```
FC=mpi_wrapper_for_intel_Fortran_compiler CC=mpi_wrapper_for_intel_C_compiler ./configure FCFLAGS="-O3 -xAVX2" CFLAGS="-O3 -xAVX2" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64"
```
#### Intel cores supporting AVX-512 (Skylake and newer) ####
We recommend that you build ELPA with the Intel compiler (if available) for the Fortran part, but
with GNU compiler for the C part.
1. Building with Intel Fortran compiler and GNU C compiler:
Remarks:
- you have to know the name of the Intel Fortran compiler wrapper
- you do not have to specify a C compiler (with CC); GNU C compiler is recognized automatically
- you should specify compiler flags for Intel Fortran compiler; in the example only `-O3 -xCORE-AVX512` is set
- you should be careful with the CFLAGS, the example shows typical flags
```
FC=mpi_wrapper_for_intel_Fortran_compiler CC=mpi_wrapper_for_gnu_C_compiler ./configure FCFLAGS="-O3 -xCORE-AVX512" CFLAGS="-O3 -march=skylake-avx512 -mfma -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64" --enable-avx2 --enable-avx512
```
2. Building with GNU Fortran compiler and GNU C compiler: