Commit bba73d07 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master_pre_stage' into 'master'

ELPA 2021.11.001

See merge request !89
parents b35e0a30 6607079d
This diff is collapsed.
......@@ -2,7 +2,7 @@ Changelog for next release
- not yet decided
Changelog for upcoming ELPA 2021.11.001.rc1
Changelog for upcoming ELPA 2021.11.001
- support of Nvidia cusolver library to accelerate some routines (needs CUDA >= 11.4)
- experimental Nvidia GPU versions for "elpa_invert_trm" and "elpa_cholesky"
......@@ -12,6 +12,10 @@ Changelog for upcoming ELPA 2021.11.001.rc1
- allow to call ELPA eigenvectors and eigenvalues also with GPU device
pointers for the input matrix, the vectors of eigenvalues and the output
matrix for the eigenvectors
- BUGFIX: error in resort_ev
- EXPERIMENTAL feature: new real GPU kernel for Nvidia A100 (provided by Nvidia): can show a
performance boost if number of vectors per MPI task is > 20000. Most likely
most benefit in the non-MPI version
- as announced, dropping the legacy interface
- more autotuning features, for example using non blocking MPI collectives
- new version of autotuning avoiding a combinatorial growth of possibilities
......@@ -19,7 +23,6 @@ Changelog for upcoming ELPA 2021.11.001.rc1
elpa%autotune_set_api_version(API_VERSION, error) is set to API_VERSION <
20211125)
Changelog for ELPA 2021.05.002
- no feature changes
- correct the SO version which was wrong in ELPA 2021.05.001
......
......@@ -3,7 +3,7 @@
For more details and recent updates please visit the online [issue system](https://gitlab.mpcdf.mpg.de/elpa/elpa/issues)
Issues which are not mentioned in a newer release are (considered as) solved.
### ELPA 2021.11.001.rc1 release ###
### ELPA 2021.11.001 release ###
Currently no issues are known
### ELPA 2021.05.002 release ###
......
......@@ -148,6 +148,10 @@ endif
if WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu src/invert_trm/GPU/CUDA/elpa_invert_trm_cuda.cu src/cholesky/GPU/CUDA/elpa_cholesky_cuda.cu
if WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real_sm80.cu src/elpa2/GPU/CUDA/mma_m8n8k4_fp64_sm80.cuh
endif
endif
if WITH_INTEL_GPU_VERSION
......
......@@ -2,7 +2,7 @@
## Current Release ##
The current release is ELPA 2021.11.001.rc1 The current supported API version
The current release is ELPA 2021.11.001. The current supported API version
is 20211125. This release supports the earliest API version 20170403.
The current version for autotuning is also 20211125 and down to version 20170403 is supported
for autotuning. When the autotune version is set to a value **below** 20211125 the old autotuning
......@@ -133,7 +133,7 @@ the possible configure options.
## Using *ELPA*
Please have a look at the [USERS_GUIDE](./documentation/USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001.rc1/html/index.html) doxygen documentation, where you find the definition of the interfaces. You might want to have a look at the [PERFORMANCE tuning document](./documentation/PERFORMANCE_TUNING.md) to avoid some usual pitfalls.
Please have a look at the [USERS_GUIDE](./documentation/USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001/html/index.html) doxygen documentation, where you find the definition of the interfaces. You might want to have a look at the [PERFORMANCE tuning document](./documentation/PERFORMANCE_TUNING.md) to avoid some usual pitfalls.
## Contributing to *ELPA*
......
......@@ -94,15 +94,15 @@ def set_scalapack_flags(instr, fc, g, m, o):
scalapackldflags="$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP "
scalapackfcflags="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP "
if (g == "with-gpu"):
if (g == "with-gpu" or g == "with-sm80-gpu"):
scalapackldflags += " -L\\$CUDA_HOME/lib64 -Wl,-rpath,\\$CUDA_HOME/lib64 -lcublas -I\\$CUDA_HOME/include"
scalapackfcflags += " -I\\$CUDA_HOME/include"
if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
if (instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
scalapackldflags = " SCALAPACK_LDFLAGS=\\\""+scalapackldflags+"\\\""
scalapackfcflags = " SCALAPACK_FCFLAGS=\\\""+scalapackfcflags+"\\\""
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
scalapackldflags = " SCALAPACK_LDFLAGS=\\\""+"\\"+scalapackldflags+"\\\""
scalapackfcflags = " SCALAPACK_FCFLAGS=\\\""+"\\"+scalapackfcflags+"\\\""
......@@ -598,7 +598,8 @@ band_to_full_blocking = {
gpu = {
"no-gpu" : "--disable-nvidia-gpu",
"with-gpu" : "--enable-nvidia-gpu --with-cuda-path=\\$CUDA_HOME/ --with-NVIDIA-GPU-compute-capability=sm_70",
"with-gpu" : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_70 -with-cuda-path=\\$CUDA_HOME/",
"with-sm80-gpu" : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_80 -with-cuda-path=\\$CUDA_HOME/" ,
}
......@@ -678,6 +679,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (fc == "pgi" and g !="with-gpu"):
continue
if (fc == "pgi" and g !="with-sm80-gpu"):
continue
mpi_configure_flag = mpi[m]
if (fc == "gnu" and m == "mpi"):
mpi_configure_flag += " --disable-mpi-module"
......@@ -701,6 +704,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (cov == "coverage" and g == "with-gpu"):
continue
if (cov == "coverage" and g == "with-sm80-gpu"):
continue
if (cov == "coverage"):
CFLAGS +=" --coverage -O0"
FCFLAGS +=" --coverage -O0"
......@@ -719,6 +724,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (g == "with-gpu" and addr == "address-sanitize"):
continue
if (g == "with-sm80-gpu" and addr == "address-sanitize"):
continue
if (instr == "knl" and addr == "address-sanitize"):
continue
......@@ -739,14 +746,20 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
#no gpu testing with openmp
if (g == "with-gpu" and o == "openmp"):
continue
if (g == "with-sm80-gpu" and o == "openmp"):
continue
#no gpu testing with intel C compiler (gcc needed)
if (g == "with-gpu" and cc == "intel"):
continue
if (g == "with-sm80-gpu" and cc == "intel"):
continue
#at the moment gpu testing only on AVX machines or minskys
if (g == "with-gpu" and (instr !="avx512" and instr !="power8")):
continue
if (g == "with-sm80-gpu" and (instr !="avx512" and instr !="power8")):
continue
# #on KNL do only intel tests
# if (instr == "knl" and (cc == "gnu" or fc == "gnu")):
......@@ -774,6 +787,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
# should be returned when solved
if (g == "with-gpu"):
MasterOnly=True
if (g == "with-sm80-gpu"):
MasterOnly=True
if (a == "no-assumed-size"):
MasterOnly=True
if (instr == "avx2" or instr == "avx512"):
......@@ -803,7 +818,10 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
else:
print(" - gpu")
else:
print(" - " + instr)
if (g == "with-sm80-gpu"):
print(" - gpu_sm80")
else:
print(" - " + instr)
print(" artifacts:")
print(" when: on_success")
print(" expire_in: 2 month")
......@@ -815,13 +833,13 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
memory = set_requested_memory(matrix_size[na])
if (g != "with-gpu"):
if (g != "with-gpu" and g != "with-sm80-gpu"):
gpuJob="no"
else:
gpuJob="yes"
# do the configure
if ( instr == "sse" or (instr == "avx" and g != "with-gpu")):
if ( instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
if ( instr == "sse"):
print(" - if [ $MATRIX_SIZE -gt 150 ]; then export SKIP_STEP=1 ; fi # our SSE test machines do not have a lot of memory")
print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
......@@ -830,7 +848,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
+ " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
+ " " +gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM -g " +gpuJob)
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
print(" - export REQUESTED_MEMORY="+memory)
print("\n")
......@@ -851,7 +869,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
# do the test
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
if (o == "openmp"):
if (cov == "no-coverage"):
openmp_threads=" 2 "
......
......@@ -758,6 +758,12 @@ if test x"${user_sets_nvidia_gpu_compute_capability}" = x"yes" ; then
AC_MSG_ERROR([Unknown Nvidia GPU compute capability set: ${withval}])
fi
fi
nvidia_a100_support=no
if test x"$cuda_compute_capability" = x"sm_80" ; then
nvidia_a100_support=yes
AC_DEFINE([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[1],[the NVIDIA GPU kernels for A100 can be used])
fi
AM_CONDITIONAL([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[test x"${nvidia_a100_support}" = x"yes"])
AC_LANG_PUSH([Fortran])
dnl Test possibility of 'use mpi', if requested
......@@ -1016,6 +1022,12 @@ m4_define(elpa_m4_nvidia_gpu_kernels, [
complex_nvidia_gpu
])
m4_define(elpa_m4_nvidia_sm80_gpu_kernels, [
real_nvidia_sm80_gpu
complex_nvidia_sm80_gpu
])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
......@@ -1026,7 +1038,8 @@ m4_define(elpa_m4_intel_gpu_kernels, [
complex_intel_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu])
#m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -1071,6 +1084,7 @@ ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_sm80_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([intel_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
......@@ -1125,6 +1139,17 @@ if test x"$with_nvidia_gpu_support_only" = x"yes" ; then
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(NVIDIA-sm_80_gpu-support-only, [AS_HELP_STRING([--with-NVIDIA-sm_80-gpu-support-only],
[Compile and always use the NVIDIA GPU version for compute capability >= sm_80])],
[],[with_nvidia_sm80_gpu_support_only=no])
if test x"$with_nvidia_sm80_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
AC_ARG_WITH(AMD-gpu-support-only, [AS_HELP_STRING([--with-AMD-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
......@@ -1221,7 +1246,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels nvidia_sm80_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1646,6 +1671,11 @@ if test x"${use_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
# not supported with old flag
#if test x"${nvidia_a100_support}" = x"yes" ; then
# use_real_nvidia_sm80_gpu=yes
# use_complex_nvidia_sm80_gpu=yes
#fi
fi
AC_MSG_CHECKING(whether NVIDIA-GPU version should be used)
......@@ -1663,6 +1693,10 @@ if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
if test x"${nvidia_a100_support}" = x"yes" ; then
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
fi
AC_MSG_CHECKING(whether NVIDIA cusolver library should be used)
......@@ -1879,6 +1913,16 @@ if test x"$use_real_nvidia_gpu" = x"yes" -o x"$use_complex_nvidia_gpu" = x"yes"
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
if test x"${nvidia_a100_support}" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_SM80_GPU_KERNEL],[1],[Nvidia sm_80 GPU kernel should be build])
# currently no complex kernel
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=0
fi
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
AS_HELP_STRING([--enable-nvtx],
......@@ -1907,6 +1951,8 @@ else
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED])
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
......@@ -2290,7 +2336,7 @@ if test x"$have_loop_blocking" = x"yes"; then
fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}.pc])
AC_CONFIG_FILES([
Makefile
......@@ -2481,13 +2527,13 @@ if test x"${ax_cv_have_avx512f_cpu_ext}" = x"yes" -a x"${need_avx512}" = x"no";
echo " "
fi
echo " "
echo "***********************************************************************"
echo "* This is the first release candidate of ELPA 2021.11.001.rc1 *"
echo "* There might be still some changes until the final release of *"
echo "* ELPA 2021.11.001 *"
echo "***********************************************************************"
echo " "
#echo " "
#echo "***********************************************************************"
#echo "* This is the first release candidate of ELPA 2021.11.001.rc1 *"
#echo "* There might be still some changes until the final release of *"
#echo "* ELPA 2021.11.001 *"
#echo "***********************************************************************"
#echo " "
if test x"$enable_kcomputer" = x"yes" ; then
echo " "
......
......@@ -2,13 +2,13 @@
## 0. Preamble ##
This file provides documentation on how to build the *ELPA* library in **version ELPA-2021.11.001.rc1**.
This file provides documentation on how to build the *ELPA* library in **version ELPA-2021.11.001**.
With release of **version ELPA-2017.05.001** the build process has been significantly simplified,
which makes it easier to install the *ELPA* library.
As announced, with the release 2021.11.001.rc the **legacy interface has been removed**.
The release of ELPA 2021.11.001.rc1 does change the API and ABI compared to the release 2020.05.002.
The release of ELPA 2021.11.001 does change the API and ABI compared to the release 2020.05.002.
## 1. How to install *ELPA* ##
......@@ -208,7 +208,7 @@ It might be necessary to also set the options (please see configure --help)
--with-GPU-compute-capability
```
Please note that with release 2021.11.001.rc1 also GPU support of AMD and Intel GPUS has been introduced.
Please note that with release 2021.11.001 also GPU support of AMD and Intel GPUs has been introduced.
However, this is still considered experimental. Especially the following features do not yet work, or have not
been tested.
......
......@@ -11,7 +11,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example `man elpa2_print_kernels` should provide the documentation for the *ELPA* program, which prints all
the available kernels.
Also a [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001.rc1/html/index.html)
Also a [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001/html/index.html)
for each *ELPA* release is available.
......@@ -178,7 +178,7 @@ The following table gives a list of all supported parameters which can be used t
## III) List of computational routines ##
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001.rc1/html/index.html) for details.
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.11.001/html/index.html) for details.
| Name | Purpose | since API version |
......
......@@ -19,7 +19,7 @@
%define with_openmp 0
Name: elpa
Version: 2021.11.001.rc1
Version: 2021.11.001
Release: 1
Summary: A massively parallel eigenvector solver
License: LGPL-3.0
......
......@@ -72,7 +72,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_SM80_GPU, 41, @ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -108,7 +109,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU, 25, @ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -6,13 +6,14 @@
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define AMD_GPU_INSTR 9
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 16
#define NVIDIA_SM80_INSTR 9
#define AMD_GPU_INSTR 10
#define INTEL_GPU_INSTR 11
#define VSX_INSTR 12
#define ARCH64_INSTR 13
#define SPARC_INSTR 14
#define SVE128_INSTR 15
#define SVE256_INSTR 16
#define SVE512_INSTR 17
#define NUMBER_OF_INSTR 17
#define NUMBER_OF_INSTR 18
......@@ -62,7 +62,7 @@
__global__ void copy_double_a_tmatc_kernel(double *a_dev, double *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1, const int nblk){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows];
}
......@@ -86,7 +86,7 @@ extern "C" void copy_double_a_tmatc_FromC(double *a_dev, double *tmatc_dev, int
__global__ void copy_float_a_tmatc_kernel(float *a_dev, float *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1, const int nblk){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows];
}
......@@ -110,7 +110,7 @@ extern "C" void copy_float_a_tmatc_FromC(float *a_dev, float *tmatc_dev, int *nb
__global__ void copy_double_complex_a_tmatc_kernel(cuDoubleComplex *a_dev, cuDoubleComplex *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = cuConj(a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows]);
}
......@@ -137,7 +137,7 @@ extern "C" void copy_double_complex_a_tmatc_FromC(double _Complex *a_dev, double
__global__ void copy_float_complex_a_tmatc_kernel(cuFloatComplex *a_dev, cuFloatComplex *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = cuConjf(a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows]);
}
......
......@@ -55,7 +55,6 @@
use mod_check_for_gpu
use invert_trm_cuda, only : copy_PRECISION_tmp1_tmp2, &
copy_PRECISION_a_tmp1
use cholesky_cuda
implicit none
#include "../general/precision_kinds.F90"
......@@ -472,7 +471,7 @@
endif ! (my_pcol==pcol(n, nblk, np_cols))
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), tmp1_dev, num, &
......@@ -480,9 +479,9 @@
check_memcpy_gpu("elpa_cholesky: tmp1_dev to tmp1", successGPU)
endif
#endif
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(tmp1, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
......@@ -495,26 +494,29 @@
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
call obj%timer%start("mpi_cuda_communication")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
#endif
!#else
! tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk,nblk])
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("cholesky: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
! call obj%timer%start("mpi_cuda_communication")
!
! call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
!#if REALCASE == 1
! MPI_REAL_PRECISION, &
!#endif
!#if COMPLEXCASE == 1
! MPI_COMPLEX_PRECISION, &
!#endif
! int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
!
! call obj%timer%stop("mpi_cuda_communication")
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(tmp1_dev, int(loc(tmp1),kind=c_intptr_t), num, &
......@@ -522,7 +524,7 @@
check_memcpy_gpu("elpa_cholesky: tmp1 to tmp1_dev", successGPU)
endif
#endif
!#endif
#endif /* WITH_MPI */
......@@ -559,7 +561,9 @@
if (useGPU) then
if (my_prow==prow(n, nblk, np_rows)) then
call copy_PRECISION_a_tmatc(a_dev, tmatc_dev, nblk, matrixRows, l_cols, l_colx, l_row1)
! if l_cols-l_colx+1 == 0 kernel launch with 0 blocks => raises error
if (l_cols-l_colx+1>0) &
call copy_PRECISION_a_tmatc(a_dev, tmatc_dev, nblk, matrixRows, l_cols, l_colx, l_row1)
endif
else ! useGPU
do i=1,nblk
......@@ -573,7 +577,7 @@
endif ! useGPU
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
if (l_cols-l_colx+1 > 0) then
num = l_cols*nblk*size_of_datatype
......@@ -582,12 +586,12 @@
check_memcpy_gpu("elpa_cholesky: tmatc_dev to tmatc", successGPU)
endif
endif
#endif
!#endif
#endif /* WITH_MPI */
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
do i=1,nblk
call obj%timer%start("mpi_communication")
if (l_cols-l_colx+1>0) &
......@@ -596,25 +600,30 @@
call obj%timer%stop("mpi_communication")
enddo
#else
tmatc_mpi_dev = transfer(tmatc_dev, tmatc_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmatc_mpi_dev, tmatc_mpi_fortran_ptr, [l_cols,nblk])
do i=1,nblk
call obj%timer%start("mpi_cuda_communication")
if (l_cols-l_colx+1>0) &
call MPI_Bcast(tmatc_mpi_fortran_ptr(l_colx,i), int(l_cols-l_colx+1,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION, &
int(prow(n, nblk, np_rows),kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
enddo
#endif
!#else
! tmatc_mpi_dev = transfer(tmatc_dev, tmatc_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmatc_mpi_dev, tmatc_mpi_fortran_ptr, [l_cols,nblk])
!
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("cholesky: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
!