elpa / elpa · Commits

Commit 1bbf7864, authored Dec 08, 2021 by Andreas Marek

Merge branch 'a100_kernel' into 'master_pre_stage'

A100 kernel

See merge request !88

Parents: 8e8860a9, d95560ce
Changes: 26 · Pipelines: 1
.gitlab-ci.yml
(diff collapsed; contents not shown)
Changelog
...
...
@@ -12,6 +12,10 @@ Changelog for upcoming ELPA 2021.11.001.rc1
- allow to call ELPA eigenvectors and eigenvalues also with GPU device
pointers for the input matrix, the vectors of eigenvalues and the output
matrix for the eigenvectors
- BUGFIX: error in resort_ev
- EXPERIMENTAL feature: new real GPU kernel for Nvidia A100 (provided by Nvidia): can show a
  performance boost if the number of vectors per MPI task is > 20000. Most likely
  the largest benefit is seen in the non-MPI version
- as announced, dropping the legacy interface
- more autotuning features, for example using non-blocking MPI collectives
- new version of autotuning avoiding a combinatorial growth of possibilities
...
...
@@ -19,7 +23,6 @@ Changelog for upcoming ELPA 2021.11.001.rc1
elpa%autotune_set_api_version(API_VERSION, error) is set to API_VERSION <
20211125)
Changelog for ELPA 2021.05.002
- no feature changes
- correct the SO version which was wrong in ELPA 2021.05.001
...
...
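The new kernel is exposed through the usual ELPA two-stage kernel selection. Below is a minimal sketch (not part of this commit) of how a C application could request it at runtime, assuming the standard ELPA C interface (elpa_set on an existing handle) and using the ELPA_2STAGE_REAL_NVIDIA_SM80_GPU constant added in elpa/elpa_constants.h.in further down; the surrounding setup and error handling are illustrative only.

#include <elpa/elpa.h>

/* Illustrative sketch: ask an already configured ELPA handle to use the
   two-stage solver with the new NVIDIA sm_80 real kernel.                */
static void request_sm80_kernel(elpa_t handle)
{
  int error;

  elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error);                    /* two-stage solver            */
  elpa_set(handle, "nvidia-gpu", 1, &error);                                 /* enable the NVIDIA GPU path  */
  elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_NVIDIA_SM80_GPU, &error); /* the new A100 kernel         */

  /* Each call should be checked against ELPA_OK (omitted here). */
}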
Makefile.am
...
...
@@ -148,6 +148,10 @@ endif
if WITH_NVIDIA_GPU_VERSION
  libelpa@SUFFIX@_private_la_SOURCES += src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu src/invert_trm/GPU/CUDA/elpa_invert_trm_cuda.cu src/cholesky/GPU/CUDA/elpa_cholesky_cuda.cu
if WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY
  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real_sm80.cu src/elpa2/GPU/CUDA/mma_m8n8k4_fp64_sm80.cuh
endif
endif
if WITH_INTEL_GPU_VERSION
...
...
ci_test_scripts/generate_gitlab_ci_tests.py
...
...
@@ -94,15 +94,15 @@ def set_scalapack_flags(instr, fc, g, m, o):
    scalapackldflags = "$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP "
    scalapackfcflags = "$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP "
    if (g == "with-gpu"):
    if (g == "with-gpu" or g == "with-sm80-gpu"):
        scalapackldflags += " -L\\$CUDA_HOME/lib64 -Wl,-rpath,\\$CUDA_HOME/lib64 -lcublas -I\\$CUDA_HOME/include"
        scalapackfcflags += " -I\\$CUDA_HOME/include"

    if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
    if (instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
        scalapackldflags = " SCALAPACK_LDFLAGS=\\\"" + scalapackldflags + "\\\""
        scalapackfcflags = " SCALAPACK_FCFLAGS=\\\"" + scalapackfcflags + "\\\""

    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
        scalapackldflags = " SCALAPACK_LDFLAGS=\\\"" + "\\" + scalapackldflags + "\\\""
        scalapackfcflags = " SCALAPACK_FCFLAGS=\\\"" + "\\" + scalapackfcflags + "\\\""
...
...
@@ -598,7 +598,8 @@ band_to_full_blocking = {
gpu = {
    "no-gpu"        : "--disable-nvidia-gpu",
    "with-gpu"      : "--enable-nvidia-gpu --with-cuda-path=\\$CUDA_HOME/ --with-NVIDIA-GPU-compute-capability=sm_70",
    "with-gpu"      : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_70 -with-cuda-path=\\$CUDA_HOME/",
    "with-sm80-gpu" : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_80 -with-cuda-path=\\$CUDA_HOME/",
}
...
...
@@ -678,6 +679,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
        continue
    if (fc == "pgi" and g != "with-gpu"):
        continue
    if (fc == "pgi" and g != "with-sm80-gpu"):
        continue

    mpi_configure_flag = mpi[m]
    if (fc == "gnu" and m == "mpi"):
        mpi_configure_flag += " --disable-mpi-module"
...
@@ -701,6 +704,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
        continue
    if (cov == "coverage" and g == "with-gpu"):
        continue
    if (cov == "coverage" and g == "with-sm80-gpu"):
        continue
    if (cov == "coverage"):
        CFLAGS  += " --coverage -O0"
        FCFLAGS += " --coverage -O0"
...
@@ -719,6 +724,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
        continue
    if (g == "with-gpu" and addr == "address-sanitize"):
        continue
    if (g == "with-sm80-gpu" and addr == "address-sanitize"):
        continue
    if (instr == "knl" and addr == "address-sanitize"):
        continue
...
@@ -739,14 +746,20 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    #no gpu testing with openmp
    if (g == "with-gpu" and o == "openmp"):
        continue
    if (g == "with-sm80-gpu" and o == "openmp"):
        continue

    #no gpu testing with intel C compiler (gcc needed)
    if (g == "with-gpu" and cc == "intel"):
        continue
    if (g == "with-sm80-gpu" and cc == "intel"):
        continue

    #at the moment gpu testing only on AVX machines or minskys
    if (g == "with-gpu" and (instr != "avx512" and instr != "power8")):
        continue
    if (g == "with-sm80-gpu" and (instr != "avx512" and instr != "power8")):
        continue
# #on KNL do only intel tests
# if (instr == "knl" and (cc == "gnu" or fc == "gnu")):
...
...
@@ -774,6 +787,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    # should be returned when solved
    if (g == "with-gpu"):
        MasterOnly = True
    if (g == "with-sm80-gpu"):
        MasterOnly = True
    if (a == "no-assumed-size"):
        MasterOnly = True
    if (instr == "avx2" or instr == "avx512"):
...
@@ -803,7 +818,10 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
        else:
            print(" - gpu")
    else:
        print(" - " + instr)
    if (g == "with-sm80-gpu"):
        print(" - gpu_sm80")
    else:
        print(" - " + instr)
    print(" artifacts:")
    print(" when: on_success")
    print(" expire_in: 2 month")
...
...
@@ -815,13 +833,13 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    memory = set_requested_memory(matrix_size[na])

    if (g != "with-gpu"):
    if (g != "with-gpu" and g != "with-sm80-gpu"):
        gpuJob = "no"
    else:
        gpuJob = "yes"

    # do the configure
    if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
    if (instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
        if (instr == "sse"):
            print(" - if [ $MATRIX_SIZE -gt 150 ]; then export SKIP_STEP=1 ; fi # our SSE test machines do not have a lot of memory")
        print(" - ./ci_test_scripts/run_ci_tests.sh -c \"CC=\\\"" + c_compiler_wrapper + "\\\"" + " CFLAGS=\\\"" + CFLAGS + "\\\"" + " FC=\\\"" + fortran_compiler_wrapper + "\\\"" + " FCFLAGS=\\\"" + FCFLAGS + "\\\"" \
...
...
@@ -830,7 +848,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
              + " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
              + " " + gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM -g " + gpuJob)

    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
        print(" - export REQUESTED_MEMORY=" + memory)

    print("\n")
...
...
@@ -851,7 +869,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    # do the test
    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
    if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
        if (o == "openmp"):
            if (cov == "no-coverage"):
                openmp_threads = " 2 "
...
...
configure.ac
...
...
@@ -758,6 +758,12 @@ if test x"${user_sets_nvidia_gpu_compute_capability}" = x"yes" ; then
AC_MSG_ERROR([Unknown Nvidia GPU compute capability set: ${withval}])
fi
fi
nvidia_a100_support=no
if test x"$cuda_compute_capability" = x"sm_80" ; then
nvidia_a100_support=yes
AC_DEFINE([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[1],[the NVIDIA GPU kernels for A100 can be used])
fi
AM_CONDITIONAL([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[test x"${nvidia_a100_support}" = x"yes"])
AC_LANG_PUSH([Fortran])
dnl Test possibility of 'use mpi', if requested
...
...
@@ -1016,6 +1022,12 @@ m4_define(elpa_m4_nvidia_gpu_kernels, [
complex_nvidia_gpu
])
m4_define(elpa_m4_nvidia_sm80_gpu_kernels, [
real_nvidia_sm80_gpu
complex_nvidia_sm80_gpu
])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
...
...
@@ -1026,7 +1038,8 @@ m4_define(elpa_m4_intel_gpu_kernels, [
complex_intel_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu])
#m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -1071,6 +1084,7 @@ ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_sm80_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([intel_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
...
...
@@ -1125,6 +1139,17 @@ if test x"$with_nvidia_gpu_support_only" = x"yes" ; then
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(NVIDIA-sm_80_gpu-support-only, [AS_HELP_STRING([--with-NVIDIA-sm_80-gpu-support-only],
[Compile and always use the NVIDIA GPU version for compute capability >= sm_80])],
[],[with_nvidia_sm80_gpu_support_only=no])
if test x"$with_nvidia_sm80_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
AC_ARG_WITH(AMD-gpu-support-only, [AS_HELP_STRING([--with-AMD-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
...
...
@@ -1221,7 +1246,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels elpa_m4_nvidia_sm80_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -1646,6 +1671,11 @@ if test x"${use_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
# not supported with old flag
#if test x"${nvidia_a100_support}" = x"yes" ; then
# use_real_nvidia_sm80_gpu=yes
# use_complex_nvidia_sm80_gpu=yes
#fi
fi
AC_MSG_CHECKING(whether NVIDIA-GPU version should be used)
...
...
@@ -1663,6 +1693,10 @@ if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
if test x"${nvidia_a100_support}" = x"yes" ; then
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
fi
AC_MSG_CHECKING(whether NVIDIA cusolver library should be used)
...
...
@@ -1879,6 +1913,16 @@ if test x"$use_real_nvidia_gpu" = x"yes" -o x"$use_complex_nvidia_gpu" = x"yes"
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
if test x"${nvidia_a100_support}" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_SM80_GPU_KERNEL],[1],[Nvidia sm_80 GPU kernel should be build])
# currently no complex kernel
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=0
fi
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
AS_HELP_STRING([--enable-nvtx],
...
...
@@ -1907,6 +1951,8 @@ else
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED])
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
...
...
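The configure.ac additions above only toggle preprocessor symbols and automake conditionals; a minimal sketch (not part of the commit) of how such a symbol can gate code at compile time. Only the WITH_NVIDIA_SM80_GPU_KERNEL macro and the launcher name come from this diff; the "config.h" include and the dispatch helper are assumptions for illustration.

/* Sketch: compile-time gating on the new autoconf symbol. */
#include "config.h"   /* assumed: the autoconf-generated configuration header */

#ifdef WITH_NVIDIA_SM80_GPU_KERNEL
/* Launcher added in src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real_sm80.cu (below). */
extern void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(double *q, const double *hh, const double *hh_tau,
                                                                   const int nev, const int nb, const int ldq, const int ncols);
#endif

/* Hypothetical helper: returns 1 if the sm_80 back-transformation kernel was launched. */
static int try_sm80_hh_trafo(double *q, const double *hh, const double *hh_tau,
                             int nev, int nb, int ldq, int ncols)
{
#ifdef WITH_NVIDIA_SM80_GPU_KERNEL
  launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q, hh, hh_tau, nev, nb, ldq, ncols);
  return 1;
#else
  (void)q; (void)hh; (void)hh_tau; (void)nev; (void)nb; (void)ldq; (void)ncols;
  return 0;   /* caller falls back to the generic NVIDIA GPU kernel */
#endif
}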
elpa/elpa_constants.h.in
...
...
@@ -72,7 +72,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_SM80_GPU, 41, @ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
@@ -108,7 +109,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU, 25, @ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
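The kernel tables in this header are X-macros: every entry carries a name, a numeric id, and a configure-substituted "compiled" flag, and consumers expand the whole list with their own definition of X. A small, self-contained sketch of the pattern follows; the DEMO_* names are illustrative, while the ids 40 and 41 are the ones visible in the hunk above.

/* Sketch of the X-macro expansion used by elpa_constants.h.in (not the real header). */
#define DEMO_FOR_SOME_2STAGE_REAL_KERNELS(X, ...) \
  X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, 1, __VA_ARGS__) \
  X(ELPA_2STAGE_REAL_NVIDIA_SM80_GPU,       41, 1, __VA_ARGS__)

/* One possible X: emit "name = id," enum members; extra arguments are ignored. */
#define DEMO_ENUM_ENTRY(name, id, compiled, ...) name = id,

enum DEMO_REAL_KERNELS {
  DEMO_FOR_SOME_2STAGE_REAL_KERNELS(DEMO_ENUM_ENTRY, 0)
};
/* expands to: ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6 = 40,
               ELPA_2STAGE_REAL_NVIDIA_SM80_GPU       = 41,  */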
elpa/elpa_simd_constants.h
...
...
@@ -6,13 +6,14 @@
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define AMD_GPU_INSTR 9
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 16
#define NVIDIA_SM80_INSTR 9
#define AMD_GPU_INSTR 10
#define INTEL_GPU_INSTR 11
#define VSX_INSTR 12
#define ARCH64_INSTR 13
#define SPARC_INSTR 14
#define SVE128_INSTR 15
#define SVE256_INSTR 16
#define SVE512_INSTR 17
#define NUMBER_OF_INSTR 17
#define NUMBER_OF_INSTR 18
src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real_sm80.cu
new file (mode 100644)
/* This file contains modified/adapted version of the original implementation kindly
* provided by NVIDIA under the MIT License. The unmodified version can be found
* in the src at src/shipped_srcs/NVIDIA_A100_kernel/
*
* Nov 2021, A. Marek, MPCDF
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cub/cub.cuh>

#define USE_MMA // On Ampere the double precision tensor cores (DMMA) are available

#ifdef USE_MMA
#include "mma_m8n8k4_fp64_sm80.cuh"
#else
template <int bK, int bN>
__device__ inline int shared_memory_offset(int k, int n)
{
  // Shared memory layout for non-MMA version.
  return k * bN + n;
}

__device__ inline constexpr int shared_memory_bytes(int bK, int bN)
{
  // Shared memory size for the bM by bK matrix. Version for the non-MMA.
  return bN * bK;
}
#endif
/*
   Householder transformation

   This is based on the original warp-sync version shown above.

   (I - tau * hh * hh^T) * q = q - tau * hh * hh^T * q

   Name here : Name in paper
     q       : X
     hh      : v
     hh_tau  : tau
     nev     : N_C
     nb      : nbw (==b)
     ncols   : N_R (==n+b-1)
*/
template <typename T, int bM, int bN, int block_y, int block_z>
__global__ void compute_hh_trafo_gpu_new(T *__restrict__ q, const T *__restrict__ hh, const T *__restrict__ hh_tau,
                                         const int nev, const int nb, const int ldq, const int ncols)
{
  constexpr int bK = bM;
  extern __shared__ int smem[];

  T *hh_s = reinterpret_cast<T *>(smem);
  T *q_s = &hh_s[bM];
  T *hh_tau_s = &q_s[shared_memory_bytes(bK, bN)];
#ifdef USE_MMA
  T *sum_s = &hh_tau_s[1]; // Shared memory buffer if we perform the inner product with DMMA.
#endif

  int j = ncols;
  int bid = blockIdx.y * bN; // n-index offset for this block.

  for (int k = threadIdx.z; k < bK; k += block_z) {
    for (int n = threadIdx.y; n < bN; n += block_y) {
      q_s[shared_memory_offset<bK, bN>(k, n)] = (n + bid) < nev ? q[(j + k - 1) * ldq + n + bid] : 0;
    }
  }

  constexpr int thread_m_dim = bM / block_z;
  constexpr int thread_n_dim = bN / block_y;

  T reg[thread_n_dim * thread_m_dim];

  while (j >= 1) {

    int hh_idx = threadIdx.z * blockDim.y + threadIdx.y;
    if (hh_idx == 0) { *hh_tau_s = hh_tau[j - 1]; }
    while (hh_idx < nb) {
      hh_s[hh_idx] = hh[hh_idx + (j - 1) * nb];
      hh_idx += blockDim.z * blockDim.y;
    }

    if (j < ncols && threadIdx.z == 0) {
      for (int n = threadIdx.y; n < bN; n += block_y) {
        q_s[shared_memory_offset<bK, bN>(0, n)] = (n + bid) < nev ? q[(j + 0 - 1) * ldq + n + bid] : 0;
      }
    }

    /**
      If we use DMMA to perform the inner product, call the routine here and store results on the buffer.
      If not, for each eigenvector, for each thread we calculate the `sum`.
    */
#ifdef USE_MMA
    __syncthreads();
    sum<bK, bN, block_z * block_y / 32>(hh_s, q_s, sum_s);
    __syncthreads();
#endif

#pragma unroll
    for (int n = 0; n < thread_n_dim; n++) {
      int n_idx = threadIdx.y + n * block_y;
#ifndef USE_MMA
      T sum = 0;
#pragma unroll 1
      for (int k = 0; k < bK; k++) { sum += hh_s[k] * q_s[shared_memory_offset<bK, bN>(k, n_idx)]; }
#endif

#pragma unroll
      for (int m = 0; m < thread_m_dim; m++) {
        int m_idx = threadIdx.z + m * block_z;
#ifdef USE_MMA
        reg[m * thread_n_dim + n] = q_s[shared_memory_offset<bK, bN>(m_idx, n_idx)] - *hh_tau_s * hh_s[m_idx] * sum_s[n_idx];
#else
        reg[m * thread_n_dim + n] = q_s[shared_memory_offset<bK, bN>(m_idx, n_idx)] - *hh_tau_s * hh_s[m_idx] * sum;
#endif
        if (j == 1 || m_idx == bM - 1) {
          if (n_idx + bid < nev) { q[(m_idx + j - 1) * ldq + n_idx + bid] = reg[m * thread_n_dim + n]; }
        }
      }
    }

    __syncthreads();

#pragma unroll
    for (int m = 0; m < thread_m_dim; m++) {
#pragma unroll
      for (int n = 0; n < thread_n_dim; n++) {
        int m_idx = threadIdx.z + m * block_z;
        int n_idx = threadIdx.y + n * block_y;
        if (m_idx + 1 < bM) { q_s[shared_memory_offset<bK, bN>(m_idx + 1, n_idx)] = reg[m * thread_n_dim + n]; }
      }
    }

    j -= 1;
  }
}
void set_max_shared_bytes(const void *func)
{
  // Set such that this kernel can use the maximum shared memory available.
  cudaFuncSetAttribute(func, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared);
  int max_shared_bytes;
  cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
  cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_bytes);
}
template <int bM, class F>
void launch_NVIDIA_sm80_kernel(F *q, const F *hh, const F *hh_tau, const int nev, const int nb, const int ldq, const int ncols)
{
#ifdef USE_MMA
  // This is set such that shared memory bank conflicts are minimized.
  constexpr int block_y = bM < 64 ? 8 : 4;
  constexpr int block_z = bM < 64 ? 4 : 8;
#else
  constexpr int block_y = 8;
  constexpr int block_z = 4;
#endif
  constexpr int bN = 8;

  auto kernel = compute_hh_trafo_gpu_new<double, bM, bN, block_y, block_z>;
  set_max_shared_bytes((const void *)kernel);

#ifdef USE_MMA
  int shared_bytes = (bM + shared_memory_bytes(bM, bN) + bN + 1) * sizeof(F);
#else
  int shared_bytes = (bM + shared_memory_bytes(bM, bN) + 1) * sizeof(F);
#endif

  int grid_y = (nev + bN - 1) / bN;
  kernel<<<dim3(1, grid_y, 1), dim3(1, block_y, block_z), shared_bytes>>>(q, hh, hh_tau, nev, nb, ldq, ncols);
}
/*
   Name here : Name in paper
     q       : X
     hh      : v
     hh_tau  : tau
     nev     : N_C
     nb      : nbw (==b)
     ncols   : N_R (==n+b-1)
*/
extern "C" {

  void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(double *q, const double *hh, const double *hh_tau,
                                                              const int nev, const int nb, const int ldq, const int ncols)
  {
    switch (nb) {
      case 1024: launch_NVIDIA_sm80_kernel<1024>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case  512: launch_NVIDIA_sm80_kernel< 512>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case  256: launch_NVIDIA_sm80_kernel< 256>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case  128: launch_NVIDIA_sm80_kernel< 128>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case   64: launch_NVIDIA_sm80_kernel<  64>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case   32: launch_NVIDIA_sm80_kernel<  32>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case   16: launch_NVIDIA_sm80_kernel<  16>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case    8: launch_NVIDIA_sm80_kernel<   8>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      case    4: launch_NVIDIA_sm80_kernel<   4>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      //case 2: launch_new_kernel< 2>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      //case 1: launch_new_kernel< 1>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
      default: printf("Unsupported nb = %d for new kernel\n", nb);
    }

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
      printf("\ncompute_hh_trafo sm80 CUDA kernel failed: %s\n", cudaGetErrorString(err));
    }
  }

  void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_single(float *q, const float *hh, const float *hh_tau,
                                                              const int nev, const int nb, const int ldq, const int ncols)
  {
    double *q_casted, *hh_casted, *hh_tau_casted;

    q_casted      = (double *) q;
    hh_casted     = (double *) hh;
    hh_tau_casted = (double *) hh_tau;

    launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q_casted, hh_casted, hh_tau_casted, nev, nb, ldq, ncols);

    q = (float *) q_casted;
  }
}
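For orientation, here is a minimal host-side sketch (not part of the commit) of driving the double-precision entry point above directly: q is ldq x (ncols+nb-1), hh is nb x ncols, and hh_tau has ncols entries, all in device memory on an sm_80 (A100-class) GPU. The sizes and surrounding code are illustrative only; in ELPA the call is made from Fortran through the ISO C bindings shown in the next file.

#include <cuda_runtime.h>

/* Entry point defined in ev_tridi_band_nvidia_gpu_real_sm80.cu above. */
extern "C" void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(double *q, const double *hh, const double *hh_tau,
                                                                       const int nev, const int nb, const int ldq, const int ncols);

int main(void)
{
  /* Illustrative sizes; nb must be one of the supported template sizes (4 ... 1024). */
  const int nev = 64, nb = 64, ncols = 256, ldq = nev;

  double *q_dev, *hh_dev, *hh_tau_dev;
  cudaMalloc(&q_dev,      sizeof(double) * (size_t)ldq * (ncols + nb - 1));
  cudaMalloc(&hh_dev,     sizeof(double) * (size_t)nb * ncols);
  cudaMalloc(&hh_tau_dev, sizeof(double) * (size_t)ncols);

  /* ... fill q, hh and hh_tau with the actual Householder data (omitted) ... */

  launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q_dev, hh_dev, hh_tau_dev, nev, nb, ldq, ncols);
  cudaDeviceSynchronize();

  cudaFree(q_dev); cudaFree(hh_dev); cudaFree(hh_tau_dev);
  return 0;
}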
src/elpa2/GPU/CUDA/interface_c_cuda_kernel.F90
...
...
@@ -59,6 +59,17 @@ module cuda_c_kernel
    end subroutine
  end interface

  interface
    subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
                                                                      bind(c)
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=c_int), value      :: nev, nb, ldq, ncols
      integer(kind=c_intptr_t), value :: q
      integer(c_intptr_t), value      :: hh_tau, hh
    end subroutine
  end interface

#ifdef WANT_SINGLE_PRECISION_REAL
  interface
    subroutine launch_compute_hh_trafo_c_cuda_kernel_real_single(q, hh, hh_tau, nev, nb, ldq, ncols) &
...
...
@@ -70,7 +81,19 @@ module cuda_c_kernel
      integer(c_intptr_t), value      :: hh_tau, hh
    end subroutine
  end interface
#endif

  interface
    subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_real_single(q, hh, hh_tau, nev, nb, ldq, ncols) &
                                                                      bind(c)
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=c_int), value      :: nev, nb, ldq, ncols
      integer(kind=c_intptr_t), value :: q
      integer(c_intptr_t), value      :: hh_tau, hh
    end subroutine
  end interface
#endif /* WANT_SINGLE_PRECISION_REAL */

  interface
    subroutine launch_compute_hh_trafo_c_cuda_kernel_complex_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
...
...
@@ -83,6 +106,17 @@ module cuda_c_kernel
    end subroutine
  end interface
!interface
! subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_complex_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
! bind(c)
! use, intrinsic :: iso_c_binding
! implicit none
! integer(kind=c_int), value :: nev, nb, ldq, ncols
! integer(kind=c_intptr_t), value :: q
! integer(kind=c_intptr_t), value :: hh_tau ,hh
! end subroutine
!end interface
#ifdef WANT_SINGLE_PRECISION_COMPLEX
interface