Commit 1bbf7864 authored by Andreas Marek

Merge branch 'a100_kernel' into 'master_pre_stage'

A100 kernel

See merge request !88
parents 8e8860a9 d95560ce
......@@ -12,6 +12,10 @@ Changelog for upcoming ELPA 2021.11.001.rc1
- allow calling ELPA eigenvectors and eigenvalues also with GPU device
pointers for the input matrix, the vectors of eigenvalues and the output
matrix for the eigenvectors
- BUGFIX: error in resort_ev
- EXPERIMENTAL feature: new real GPU kernel for Nvidia A100 (provided by Nvidia): can show a
performance boost if the number of vectors per MPI task is > 20000. Most likely
the largest benefit is seen in the non-MPI version (a usage sketch follows this Changelog excerpt)
- as announced, dropping the legacy interface
- more autotuning features, for example using non-blocking MPI collectives
- new version of autotuning avoiding a combinatorial growth of possibilities
......@@ -19,7 +23,6 @@ Changelog for upcoming ELPA 2021.11.001.rc1
elpa%autotune_set_api_version(API_VERSION, error) is set to API_VERSION <
20211125)
Changelog for ELPA 2021.05.002
- no feature changes
- correct the SO version which was wrong in ELPA 2021.05.001
......
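For context, a minimal sketch of how the new kernel can be requested through the existing ELPA C API once the library is configured for compute capability sm_80. This is not taken from this merge request: the handle is assumed to be already created and configured, the option names "nvidia-gpu" and "real_kernel" follow the 2021.x API, and ELPA_2STAGE_REAL_NVIDIA_SM80_GPU is the constant added in elpa_constants.h.in below.

#include <elpa/elpa.h>

/* Sketch only: select the experimental A100 kernel on an existing handle. */
void request_a100_kernel(elpa_t handle)
{
  int error;
  elpa_set(handle, "nvidia-gpu", 1, &error);   /* enable the NVIDIA GPU backend */
  elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_NVIDIA_SM80_GPU, &error);
}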
......@@ -148,6 +148,10 @@ endif
if WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu src/invert_trm/GPU/CUDA/elpa_invert_trm_cuda.cu src/cholesky/GPU/CUDA/elpa_cholesky_cuda.cu
if WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real_sm80.cu src/elpa2/GPU/CUDA/mma_m8n8k4_fp64_sm80.cuh
endif
endif
if WITH_INTEL_GPU_VERSION
......
......@@ -94,15 +94,15 @@ def set_scalapack_flags(instr, fc, g, m, o):
scalapackldflags="$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP "
scalapackfcflags="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP "
if (g == "with-gpu"):
if (g == "with-gpu" or g == "with-sm80-gpu"):
scalapackldflags += " -L\\$CUDA_HOME/lib64 -Wl,-rpath,\\$CUDA_HOME/lib64 -lcublas -I\\$CUDA_HOME/include"
scalapackfcflags += " -I\\$CUDA_HOME/include"
if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
if (instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
scalapackldflags = " SCALAPACK_LDFLAGS=\\\""+scalapackldflags+"\\\""
scalapackfcflags = " SCALAPACK_FCFLAGS=\\\""+scalapackfcflags+"\\\""
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
scalapackldflags = " SCALAPACK_LDFLAGS=\\\""+"\\"+scalapackldflags+"\\\""
scalapackfcflags = " SCALAPACK_FCFLAGS=\\\""+"\\"+scalapackfcflags+"\\\""
......@@ -598,7 +598,8 @@ band_to_full_blocking = {
gpu = {
"no-gpu" : "--disable-nvidia-gpu",
"with-gpu" : "--enable-nvidia-gpu --with-cuda-path=\\$CUDA_HOME/ --with-NVIDIA-GPU-compute-capability=sm_70",
"with-gpu" : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_70 -with-cuda-path=\\$CUDA_HOME/",
"with-sm80-gpu" : "--enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability=sm_80 -with-cuda-path=\\$CUDA_HOME/" ,
}
......@@ -678,6 +679,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (fc == "pgi" and g !="with-gpu"):
continue
if (fc == "pgi" and g !="with-sm80-gpu"):
continue
mpi_configure_flag = mpi[m]
if (fc == "gnu" and m == "mpi"):
mpi_configure_flag += " --disable-mpi-module"
......@@ -701,6 +704,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (cov == "coverage" and g == "with-gpu"):
continue
if (cov == "coverage" and g == "with-sm80-gpu"):
continue
if (cov == "coverage"):
CFLAGS +=" --coverage -O0"
FCFLAGS +=" --coverage -O0"
......@@ -719,6 +724,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
continue
if (g == "with-gpu" and addr == "address-sanitize"):
continue
if (g == "with-sm80-gpu" and addr == "address-sanitize"):
continue
if (instr == "knl" and addr == "address-sanitize"):
continue
......@@ -739,14 +746,20 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
#no gpu testing with openmp
if (g == "with-gpu" and o == "openmp"):
continue
if (g == "with-sm80-gpu" and o == "openmp"):
continue
#no gpu testing with intel C compiler (gcc needed)
if (g == "with-gpu" and cc == "intel"):
continue
if (g == "with-sm80-gpu" and cc == "intel"):
continue
#at the moment gpu testing only on AVX machines or minskys
if (g == "with-gpu" and (instr !="avx512" and instr !="power8")):
continue
if (g == "with-sm80-gpu" and (instr !="avx512" and instr !="power8")):
continue
# #on KNL do only intel tests
# if (instr == "knl" and (cc == "gnu" or fc == "gnu")):
......@@ -774,6 +787,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
# should be returned when solved
if (g == "with-gpu"):
MasterOnly=True
if (g == "with-sm80-gpu"):
MasterOnly=True
if (a == "no-assumed-size"):
MasterOnly=True
if (instr == "avx2" or instr == "avx512"):
......@@ -803,7 +818,10 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
else:
print(" - gpu")
else:
print(" - " + instr)
if (g == "with-sm80-gpu"):
print(" - gpu_sm80")
else:
print(" - " + instr)
print(" artifacts:")
print(" when: on_success")
print(" expire_in: 2 month")
......@@ -815,13 +833,13 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
memory = set_requested_memory(matrix_size[na])
if (g != "with-gpu"):
if (g != "with-gpu" and g != "with-sm80-gpu"):
gpuJob="no"
else:
gpuJob="yes"
# do the configure
if ( instr == "sse" or (instr == "avx" and g != "with-gpu")):
if ( instr == "sse" or (instr == "avx" and g != "with-gpu" and g != "with-sm80-gpu")):
if ( instr == "sse"):
print(" - if [ $MATRIX_SIZE -gt 150 ]; then export SKIP_STEP=1 ; fi # our SSE test machines do not have a lot of memory")
print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
......@@ -830,7 +848,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
+ " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
+ " " +gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM -g " +gpuJob)
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
print(" - export REQUESTED_MEMORY="+memory)
print("\n")
......@@ -851,7 +869,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
# do the test
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu" or g == "with-sm80-gpu"):
if (o == "openmp"):
if (cov == "no-coverage"):
openmp_threads=" 2 "
......
......@@ -758,6 +758,12 @@ if test x"${user_sets_nvidia_gpu_compute_capability}" = x"yes" ; then
AC_MSG_ERROR([Unknown Nvidia GPU compute capability set: ${withval}])
fi
fi
nvidia_a100_support=no
if test x"$cuda_compute_capability" = x"sm_80" ; then
nvidia_a100_support=yes
AC_DEFINE([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[1],[the NVIDIA GPU kernels for A100 can be used])
fi
AM_CONDITIONAL([WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY],[test x"${nvidia_a100_support}" = x"yes"])
AC_LANG_PUSH([Fortran])
dnl Test possibility of 'use mpi', if requested
......@@ -1016,6 +1022,12 @@ m4_define(elpa_m4_nvidia_gpu_kernels, [
complex_nvidia_gpu
])
m4_define(elpa_m4_nvidia_sm80_gpu_kernels, [
real_nvidia_sm80_gpu
complex_nvidia_sm80_gpu
])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
......@@ -1026,7 +1038,8 @@ m4_define(elpa_m4_intel_gpu_kernels, [
complex_intel_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu])
#m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu intel_gpu nvidia_sm80_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -1071,6 +1084,7 @@ ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_sm80_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([intel_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
......@@ -1125,6 +1139,17 @@ if test x"$with_nvidia_gpu_support_only" = x"yes" ; then
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(NVIDIA-sm_80_gpu-support-only, [AS_HELP_STRING([--with-NVIDIA-sm_80-gpu-support-only],
[Compile and always use the NVIDIA GPU version for compute capability >= sm_80])],
[],[with_nvidia_sm80_gpu_support_only=no])
if test x"$with_nvidia_sm80_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
AC_ARG_WITH(AMD-gpu-support-only, [AS_HELP_STRING([--with-AMD-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
......@@ -1221,7 +1246,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels elpa_m4_amd_gpu_kernels elpa_m4_intel_gpu_kernels elpa_m4_nvidia_sm80_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1646,6 +1671,11 @@ if test x"${use_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
# not supported with old flag
#if test x"${nvidia_a100_support}" = x"yes" ; then
# use_real_nvidia_sm80_gpu=yes
# use_complex_nvidia_sm80_gpu=yes
#fi
fi
AC_MSG_CHECKING(whether NVIDIA-GPU version should be used)
......@@ -1663,6 +1693,10 @@ if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
if test x"${nvidia_a100_support}" = x"yes" ; then
use_real_nvidia_sm80_gpu=yes
use_complex_nvidia_sm80_gpu=yes
fi
fi
AC_MSG_CHECKING(whether NVIDIA cusolver library should be used)
......@@ -1879,6 +1913,16 @@ if test x"$use_real_nvidia_gpu" = x"yes" -o x"$use_complex_nvidia_gpu" = x"yes"
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
if test x"${nvidia_a100_support}" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_SM80_GPU_KERNEL],[1],[Nvidia sm_80 GPU kernel should be built])
# currently no complex kernel
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED=0
fi
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
AS_HELP_STRING([--enable-nvtx],
......@@ -1907,6 +1951,8 @@ else
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED])
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
......
......@@ -72,7 +72,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_SM80_GPU, 41, @ELPA_2STAGE_REAL_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -108,7 +109,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU, 25, @ELPA_2STAGE_COMPLEX_NVIDIA_SM80_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -6,13 +6,14 @@
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define AMD_GPU_INSTR 9
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 16
#define NVIDIA_SM80_INSTR 9
#define AMD_GPU_INSTR 10
#define INTEL_GPU_INSTR 11
#define VSX_INSTR 12
#define ARCH64_INSTR 13
#define SPARC_INSTR 14
#define SVE128_INSTR 15
#define SVE256_INSTR 16
#define SVE512_INSTR 17
#define NUMBER_OF_INSTR 17
#define NUMBER_OF_INSTR 18
/* This file contains a modified/adapted version of the original implementation kindly
* provided by NVIDIA under the MIT License. The unmodified version can be found
* in the source tree at src/shipped_srcs/NVIDIA_A100_kernel/
*
* Nov 2021, A. Marek, MPCDF
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cub/cub.cuh>
#define USE_MMA // On Ampere the double precision tensor cores (DMMA) are available
#ifdef USE_MMA
#include "mma_m8n8k4_fp64_sm80.cuh"
#else
template<int bK, int bN>
__device__ inline int shared_memory_offset(int k, int n) {
// Shared memory layout for non-MMA version.
return k * bN + n;
}
__device__ inline constexpr int shared_memory_bytes(int bK, int bN) {
// Number of shared-memory elements for the bK by bN tile of q. Version for the non-MMA path.
return bN * bK;
}
#endif
/*
Householder transformation
This is based on the original warp-sync version shown above.
(I - tau * hh * hh^T) * q = q - tau * hh * hh^T * q
Name here : Name in paper
q : X
hh : v
hh_tau : tau
nev : N_C
nb : nbw (==b)
ncols : N_R (==n+b-1)
*/
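/* Editorial illustration, not part of the shipped kernel: the update applied per
 * Householder vector, written as a plain host-side reference loop. The indexing
 * (column n over eigenvectors, row offset j + k - 1, leading dimension ldq)
 * mirrors the access pattern of compute_hh_trafo_gpu_new below.
 */
template <typename T>
void householder_update_reference(T *q, const T *hh, T hh_tau,
                                  int nev, int nb, int ldq, int j)
{
  for (int n = 0; n < nev; n++) {            // each eigenvector column
    T sum = 0;
    for (int k = 0; k < nb; k++)             // sum = hh^T * q(:, n)
      sum += hh[k] * q[(j + k - 1) * ldq + n];
    for (int k = 0; k < nb; k++)             // q(:, n) -= tau * hh * sum
      q[(j + k - 1) * ldq + n] -= hh_tau * hh[k] * sum;
  }
}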
template <typename T, int bM, int bN, int block_y, int block_z>
__global__ void compute_hh_trafo_gpu_new(T * __restrict__ q, const T * __restrict__ hh, const T * __restrict__ hh_tau, const int nev, const int nb, const int ldq, const int ncols)
{
constexpr int bK = bM;
extern __shared__ int smem[];
T *hh_s = reinterpret_cast<T *>(smem);
T *q_s = &hh_s[bM];
T *hh_tau_s = &q_s[shared_memory_bytes(bK, bN)];
#ifdef USE_MMA
T *sum_s = &hh_tau_s[1]; // Shared memory buffer if we perform the inner product with DMMA.
#endif
int j = ncols;
int bid = blockIdx.y * bN; // n-index offset for this block.
for (int k = threadIdx.z; k < bK; k += block_z) {
for (int n = threadIdx.y; n < bN; n += block_y) {
q_s[shared_memory_offset<bK, bN>(k, n)] = (n + bid) < nev ? q[(j + k - 1) * ldq + n + bid] : 0;
}
}
constexpr int thread_m_dim = bM / block_z;
constexpr int thread_n_dim = bN / block_y;
T reg[thread_n_dim * thread_m_dim];
while (j >= 1)
{
int hh_idx = threadIdx.z * blockDim.y + threadIdx.y;
if (hh_idx == 0) { *hh_tau_s = hh_tau[j - 1]; }
while (hh_idx < nb) {
hh_s[hh_idx] = hh[hh_idx + (j - 1) * nb];
hh_idx += blockDim.z * blockDim.y;
}
if (j < ncols && threadIdx.z == 0) {
for (int n = threadIdx.y; n < bN; n += block_y) {
q_s[shared_memory_offset<bK, bN>(0, n)] = (n + bid) < nev ? q[(j + 0 - 1) * ldq + n + bid] : 0;
}
}
/**
If we use DMMA to perform the inner product, call the routine here and store the results in the shared-memory buffer.
If not, each thread calculates the `sum` for its eigenvector columns in the loop below.
*/
#ifdef USE_MMA
__syncthreads();
sum<bK, bN, block_z * block_y / 32>(hh_s, q_s, sum_s);
__syncthreads();
#endif
#pragma unroll
for (int n = 0; n < thread_n_dim; n++) {
int n_idx = threadIdx.y + n * block_y;
#ifndef USE_MMA
T sum = 0;
#pragma unroll 1
for (int k = 0; k < bK; k++) {
sum += hh_s[k] * q_s[shared_memory_offset<bK, bN>(k, n_idx)];
}
#endif
#pragma unroll
for (int m = 0; m < thread_m_dim; m++) {
int m_idx = threadIdx.z + m * block_z;
#ifdef USE_MMA
reg[m * thread_n_dim + n] = q_s[shared_memory_offset<bK, bN>(m_idx, n_idx)] - *hh_tau_s * hh_s[m_idx] * sum_s[n_idx];
#else
reg[m * thread_n_dim + n] = q_s[shared_memory_offset<bK, bN>(m_idx, n_idx)] - *hh_tau_s * hh_s[m_idx] * sum;
#endif
if (j == 1 || m_idx == bM - 1) {
if (n_idx + bid < nev) { q[(m_idx + j - 1) * ldq + n_idx + bid] = reg[m * thread_n_dim + n]; }
}
}
}
__syncthreads();
#pragma unroll
for (int m = 0; m < thread_m_dim; m++) {
#pragma unroll
for (int n = 0; n < thread_n_dim; n++) {
int m_idx = threadIdx.z + m * block_z;
int n_idx = threadIdx.y + n * block_y;
if (m_idx + 1 < bM) { q_s[shared_memory_offset<bK, bN>(m_idx + 1, n_idx)] = reg[m * thread_n_dim + n]; }
}
}
j -= 1;
}
}
void set_max_shared_bytes(const void *func)
{
// Set such that this kernel can use the maximum shared memory available.
cudaFuncSetAttribute(func, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared);
int max_shared_bytes;
cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_bytes);
}
template <int bM, class F>
void launch_NVIDIA_sm80_kernel(F *q, const F *hh, const F *hh_tau, const int nev, const int nb, const int ldq, const int ncols)
{
#ifdef USE_MMA
// This is set such that shared memory bank conflicts are minimized.
constexpr int block_y = bM < 64 ? 8 : 4;
constexpr int block_z = bM < 64 ? 4 : 8;
#else
constexpr int block_y = 8;
constexpr int block_z = 4;
#endif
constexpr int bN = 8;
auto kernel = compute_hh_trafo_gpu_new<double, bM, bN, block_y, block_z>;
set_max_shared_bytes((const void *)kernel);
#ifdef USE_MMA
int shared_bytes = (bM + shared_memory_bytes(bM, bN) + bN + 1) * sizeof(F);
#else
int shared_bytes = (bM + shared_memory_bytes(bM, bN) + 1) * sizeof(F);
#endif
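// The dynamic shared-memory budget covers hh_s (bM elements), the q_s tile
// (shared_memory_bytes(bM, bN) elements), hh_tau_s (1 element) and, in the
// MMA path, the sum_s buffer (bN elements), all of type F.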
int grid_y = (nev + bN - 1) / bN;
kernel<<<dim3(1, grid_y, 1), dim3(1, block_y, block_z), shared_bytes>>>(q, hh, hh_tau, nev, nb, ldq, ncols);
}
/*
Name here : Name in paper
q : X
hh : v
hh_tau : tau
nev : N_C
nb : nbw (==b)
ncols : N_R (==n+b-1)
*/
extern "C" {
void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(double *q, const double *hh, const double *hh_tau, const int nev, const int nb, const int ldq, const int ncols)
{
switch (nb) {
case 1024: launch_NVIDIA_sm80_kernel<1024>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 512: launch_NVIDIA_sm80_kernel< 512>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 256: launch_NVIDIA_sm80_kernel< 256>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 128: launch_NVIDIA_sm80_kernel< 128>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 64: launch_NVIDIA_sm80_kernel< 64>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 32: launch_NVIDIA_sm80_kernel< 32>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 16: launch_NVIDIA_sm80_kernel< 16>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 8: launch_NVIDIA_sm80_kernel< 8>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
case 4: launch_NVIDIA_sm80_kernel< 4>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
//case 2: launch_new_kernel< 2>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
//case 1: launch_new_kernel< 1>(q, hh, hh_tau, nev, nb, ldq, ncols); break;
default: printf("Unsupported nb = %d for new kernel \n", nb);
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("\n compute_hh_trafo sm80 CUDA kernel failed: %s \n",cudaGetErrorString(err));
}
}
void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_single(float *q, const float *hh, const float *hh_tau, const int nev, const int nb, const int ldq, const int ncols) {
double *q_casted, *hh_casted, *hh_tau_casted;
q_casted = (double*) q;
hh_casted = (double*) hh;
hh_tau_casted = (double*) hh_tau;
launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q_casted, hh_casted, hh_tau_casted, nev, nb, ldq, ncols);
q = (float*) q_casted;
}
}
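For orientation, a minimal host-side usage sketch of the extern "C" entry point above (not part of this merge request; the problem sizes are placeholders, and q, hh and hh_tau are assumed to already reside in device memory, as implied by the Fortran device-pointer bindings that follow):

#include <cuda_runtime.h>

extern "C" void launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(
    double *q, const double *hh, const double *hh_tau,
    const int nev, const int nb, const int ldq, const int ncols);

int main()
{
  // hypothetical problem sizes, for illustration only
  const int nev = 1000, nb = 64, ldq = 1024, ncols = 2048;

  double *q_dev, *hh_dev, *hh_tau_dev;
  cudaMalloc(&q_dev,      sizeof(double) * (size_t)ldq * (ncols + nb));  // generous upper bound
  cudaMalloc(&hh_dev,     sizeof(double) * (size_t)nb * ncols);
  cudaMalloc(&hh_tau_dev, sizeof(double) * (size_t)ncols);
  // ... fill q, hh and hh_tau on the device (omitted) ...

  launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q_dev, hh_dev, hh_tau_dev,
                                                         nev, nb, ldq, ncols);
  cudaDeviceSynchronize();

  cudaFree(q_dev); cudaFree(hh_dev); cudaFree(hh_tau_dev);
  return 0;
}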
......@@ -59,6 +59,17 @@ module cuda_c_kernel
end subroutine
end interface
interface
subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_real_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
bind(c)
use, intrinsic :: iso_c_binding
implicit none
integer(kind=c_int), value :: nev, nb, ldq, ncols
integer(kind=c_intptr_t), value :: q
integer(c_intptr_t), value :: hh_tau ,hh
end subroutine
end interface
#ifdef WANT_SINGLE_PRECISION_REAL
interface
subroutine launch_compute_hh_trafo_c_cuda_kernel_real_single(q, hh, hh_tau, nev, nb, ldq, ncols) &
......@@ -70,7 +81,19 @@ module cuda_c_kernel
integer(c_intptr_t), value :: hh_tau ,hh
end subroutine
end interface
#endif
interface
subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_real_single(q, hh, hh_tau, nev, nb, ldq, ncols) &
bind(c)
use, intrinsic :: iso_c_binding
implicit none
integer(kind=c_int), value :: nev, nb, ldq, ncols
integer(kind=c_intptr_t), value :: q
integer(c_intptr_t), value :: hh_tau ,hh
end subroutine
end interface
#endif /* WANT_SINGLE_PRECISION_REAL */
interface
subroutine launch_compute_hh_trafo_c_cuda_kernel_complex_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
......@@ -83,6 +106,17 @@ module cuda_c_kernel
end subroutine
end interface
!interface
! subroutine launch_compute_hh_trafo_c_cuda_sm80_kernel_complex_double(q, hh, hh_tau, nev, nb, ldq, ncols) &
! bind(c)
! use, intrinsic :: iso_c_binding
! implicit none
! integer(kind=c_int), value :: nev, nb, ldq, ncols
! integer(kind=c_intptr_t), value :: q
! integer(kind=c_intptr_t), value :: hh_tau ,hh
! end subroutine
!end interface
#ifdef WANT_SINGLE_PRECISION_COMPLEX
interface