Commit b40c7d47 authored by Andreas Marek
Browse files

Merge branch 'ELPA_ROCm' into oneAPI

parents ee776ac8 9407142d
This diff is collapsed.
......@@ -789,6 +789,7 @@ CLEANFILES = \
clean-local:
-rm -rf modules/* private_modules/* test_modules/* .fortran_dependencies/*
-rm -rf validate_*.sh
-rm -rf test_*.sh
-rm -rf real_2stage*.sh
-rm -rf complex_2stage*.sh
-rm -rf single_complex_2stage*.sh
......
......@@ -346,7 +346,7 @@ ilp64_no_omp_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -364,7 +364,7 @@ ilp64_no_omp_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -388,7 +388,7 @@ ilp64_no_omp_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -406,7 +406,7 @@ ilp64_no_omp_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -430,7 +430,7 @@ matrix_redistribute_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -448,7 +448,7 @@ matrix_redistribute_mpi_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -472,7 +472,7 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -486,7 +486,7 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP\\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP\\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m 150 -n 50 -b 16 '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
......@@ -496,13 +496,13 @@ python_ci_tests = [
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests "'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests "'
'-d " CC=\\\\\\\"mpiicc\\\\\\\" CFLAGS=\\\\\\\"-O3 -xAVX\\\\\\\" '
'FC=\\\\\\\"mpiifort\\\\\\\" FCFLAGS=\\\\\\\"-O3 -xAVX\\\\\\\" '
'SCALAPACK_LDFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP \\\" '
'SCALAPACK_FCFLAGS=\\\"$MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP \\\" '
'--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
'--disable-gpu --enable-avx --enable-python --enable-python-tests'
'--disable-nvidia-gpu --enable-avx --enable-python --enable-python-tests'
'" -t $MPI_TASKS -m 150 -n 50 -b 16 -S $SLURM || { chmod u+rwX -R . ; exit 1 ; }',
"\n",
]
......@@ -597,8 +597,8 @@ band_to_full_blocking = {
}
gpu = {
"no-gpu" : "--disable-gpu",
"with-gpu" : "--enable-gpu --with-cuda-path=\\$CUDA_HOME/",
"no-gpu" : "--disable-nvidia-gpu",
"with-gpu" : "--enable-nvidia-gpu --with-cuda-path=\\$CUDA_HOME/",
}
......
......@@ -52,26 +52,27 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AMD_GPU, 19, @ELPA_2STAGE_REAL_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 26, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 27, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 28, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_INTEL_GPU, 20, @ELPA_2STAGE_REAL_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 23, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 26, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 27, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 28, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 29, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 32, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 35, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -106,7 +107,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -6,11 +6,13 @@
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define SVE128_INSTR 12
#define SVE256_INSTR 13
#define SVE512_INSTR 14
#define AMD_GPU_INSTR 9
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 16
#define NUMBER_OF_INSTR 15
#define NUMBER_OF_INSTR 17
......@@ -21,9 +21,10 @@ solver_flag = {
"scalapack_part": "-DTEST_SCALAPACK_PART",
}
gpu_flag = {
"GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0",
"GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0 -DTEST_AMD_GPU=0",
"NVIDIA_GPU_ON": "-DTEST_NVIDIA_GPU=1",
"INTEL_GPU_ON": "-DTEST_INTEL_GPU=1",
"AMD_GPU_ON": "-DTEST_AMD_GPU=1",
}
gpu_id_flag = {
0: "-DTEST_GPU_SET_ID=0",
......@@ -87,14 +88,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors")):
if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors")):
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if(m == "frank" and ((t != "eigenvectors" or t != "eigenvalues") and (d != "real" or p != "double"))):
continue
if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors" or m != "analytic")):
if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors" or m != "analytic")):
continue
# do not test single-precision scalapack
......@@ -128,7 +129,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
# qr only for 2stage real
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or "INTEL_GPU_ON" or m != "random")):
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or "INTEL_GPU_ON" or g == "AMD_GPU_ON" or m != "random")):
continue
if(spl == "myself" and (d != "real" or p != "double" or q != 0 or m != "random" or (t != "eigenvectors" and t != "cholesky") or lang != "Fortran" or lay != "square")):
......@@ -163,6 +164,10 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print("if WITH_INTEL_GPU_VERSION")
endifs += 1
if (g == "AMD_GPU_ON"):
print("if WITH_AMD_GPU_VERSION")
endifs += 1
if (lay == "all_layouts"):
print("if WITH_MPI")
endifs += 1
......@@ -195,7 +200,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON") else "",
gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON") else "",
gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "",
m=m,
......
......@@ -91,6 +91,8 @@ module simd_kernel
realKernels_to_simdTable(ELPA_2STAGE_REAL_SVE512_BLOCK4) = SVE512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SVE512_BLOCK6) = SVE512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_NVIDIA_GPU) = NVIDIA_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AMD_GPU) = AMD_GPU_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_INTEL_GPU) = INTEL_GPU_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR
......@@ -117,19 +119,21 @@ module simd_kernel
integer(kind=c_int) :: kernel
integer(kind=c_int), intent(in) :: simd_set_index
simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC
simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2
simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2
simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2
simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2
simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_NVIDIA_GPU
simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2
simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2
simdTable_to_realKernels(SVE128_INSTR) = ELPA_2STAGE_REAL_SVE128_BLOCK2
simdTable_to_realKernels(SVE256_INSTR) = ELPA_2STAGE_REAL_SVE256_BLOCK2
simdTable_to_realKernels(SVE512_INSTR) = ELPA_2STAGE_REAL_SVE512_BLOCK2
simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC
simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2
simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2
simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2
simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2
simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_NVIDIA_GPU
simdTable_to_realKernels(AMD_GPU_INSTR) = ELPA_2STAGE_REAL_AMD_GPU
simdTable_to_realKernels(INTEL_GPU_INSTR) = ELPA_2STAGE_REAL_INTEL_GPU
simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2
simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2
simdTable_to_realKernels(SVE128_INSTR) = ELPA_2STAGE_REAL_SVE128_BLOCK2
simdTable_to_realKernels(SVE256_INSTR) = ELPA_2STAGE_REAL_SVE256_BLOCK2
simdTable_to_realKernels(SVE512_INSTR) = ELPA_2STAGE_REAL_SVE512_BLOCK2
kernel = simdTable_to_realKernels(simd_set_index)
......@@ -164,6 +168,8 @@ module simd_kernel
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1) = ARCH64_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2) = ARCH64_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NVIDIA_GPU) = NVIDIA_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AMD_GPU) = AMD_GPU_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_INTEL_GPU) = INTEL_GPU_INSTR
simd_set_index = complexKernels_to_simdTable(kernel)
......@@ -177,17 +183,19 @@ module simd_kernel
integer(kind=c_int) :: kernel
integer(kind=c_int), intent(in) :: simd_set_index
simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC
simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
simdTable_to_complexKernels(SVE128_INSTR) = ELPA_2STAGE_COMPLEX_SVE128_BLOCK1
simdTable_to_complexKernels(SVE256_INSTR) = ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
simdTable_to_complexKernels(SVE512_INSTR) = ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
simdTable_to_complexKernels(ARCH64_INSTR) = ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC
simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
simdTable_to_complexKernels(SVE128_INSTR) = ELPA_2STAGE_COMPLEX_SVE128_BLOCK1
simdTable_to_complexKernels(SVE256_INSTR) = ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
simdTable_to_complexKernels(SVE512_INSTR) = ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
simdTable_to_complexKernels(ARCH64_INSTR) = ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
simdTable_to_complexKernels(AMD_GPU_INSTR) = ELPA_2STAGE_COMPLEX_AMD_GPU
simdTable_to_complexKernels(INTEL_GPU_INSTR) = ELPA_2STAGE_COMPLEX_INTEL_GPU
kernel = simdTable_to_complexKernels(simd_set_index)
......
......@@ -127,6 +127,14 @@
#define TEST_C_INT_MPI_TYPE int
#define C_INT_MPI_TYPE int
#endif
#define TEST_GPU 0
#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
#undef TEST_GPU
#define TEST_GPU 1
#endif
#include "test/shared/generated.h"
int main(int argc, char** argv) {
......@@ -273,19 +281,18 @@ int main(int argc, char** argv) {
#endif
assert_elpa_ok(error_elpa);
#if defined(TEST_NVIDIA_GPU)
elpa_set(handle, "nvidia-gpu", TEST_NVIDIA_GPU, &error_elpa);
assert_elpa_ok(error_elpa);
#else
elpa_set(handle, "nvidia-gpu", 0, &error_elpa);
#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0)
elpa_set(handle, "nvidia-gpu", TEST_GPU, &error_elpa);
assert_elpa_ok(error_elpa);
#endif
#if defined(TEST_INTEL_GPU)
printf("Setting INTEL GPU\n");
elpa_set(handle, "intel-gpu", TEST_INTEL_GPU, &error_elpa);
#if TEST_AMD_GPU == 1
elpa_set(handle, "amd-gpu", TEST_GPU, &error_elpa);
assert_elpa_ok(error_elpa);
#else
elpa_set(handle, "intel-gpu", 0, &error_elpa);
#endif
#if TEST_INTEL_GPU == 1
elpa_set(handle, "intel-gpu", TEST_GPU, &error_elpa);
assert_elpa_ok(error_elpa);
#endif
......
......@@ -47,6 +47,7 @@
! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
! Define TEST_NVIDIA_GPU \in [0, 1]
! Define TEST_INTEL_GPU \in [0, 1]
! Define TEST_AMD_GPU \in [0, 1]
! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
......@@ -117,6 +118,13 @@ error: define either TEST_ALL_KERNELS or a valid TEST_KERNEL
#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
#define INT_MPI_TYPE c_int32_t
#endif
#define TEST_GPU 0
#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
#undef TEST_GPU
#define TEST_GPU 1
#endif
#include "assert.h"
program test
......@@ -281,7 +289,7 @@ program test
#if TEST_QR_DECOMPOSITION == 1
#if (TEST_NVIDIA_GPU == 1) || (TEST_INTEL_GPU == 1)
#if (TEST_NVIDIA_GPU == 1) || (TEST_INTEL_GPU == 1) || (TEST_AMD_GPU == 1)
#ifdef WITH_MPI
call mpi_finalize(mpierr)
#endif
......@@ -641,23 +649,19 @@ program test
#endif
assert_elpa_ok(error_elpa)
#if (TEST_NVIDIA_GPU == 1)
call e%set("nvidia-gpu", TEST_NVIDIA_GPU, error_elpa)
#else
call e%set("nvidia-gpu", 0, error_elpa)
#endif
#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
call e%set("nvidia-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#if (TEST_INTEL_GPU == 1)
call e%set("intel-gpu", TEST_INTEL_GPU, error_elpa)
#else
call e%set("intel-gpu", 0 , error_elpa)
#endif
#if TEST_AMD_GPU == 1
call e%set("amd-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#if (TEST_NVIDIA_GPU == 1)
stop "AAAAAAAAAAAAAAA"
#endif
#if (TEST_INTEL_GPU == 1)
print *,"Using intel gpu"
#if TEST_INTEL_GPU == 1
call e%set("intel-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if (TEST_GPU_SET_ID == 1) && (TEST_INTEL_GPU == 0)
......@@ -685,11 +689,15 @@ program test
#ifdef TEST_ALL_KERNELS
do i = 0, elpa_option_cardinality(KERNEL_KEY) ! kernels
#if (TEST_NVIDIA_GPU == 0) && (TEST_INTEL_GPU == 0)
#if (TEST_NVIDIA_GPU == 0) && (TEST_INTEL_GPU == 0) && (TEST_AMD_GPU == 0)
!if (TEST_GPU .eq. 0) then
kernel = elpa_option_enumerate(KERNEL_KEY, int(i,kind=c_int))
if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) continue
if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) continue
if (kernel .eq. ELPA_2STAGE_REAL_AMD_GPU) continue
if (kernel .eq. ELPA_2STAGE_COMPLEX_AMD_GPU) continue
if (kernel .eq. ELPA_2STAGE_REAL_INTEL_GPU) continue
if (kernel .eq. ELPA_2STAGE_COMPLEX_INTEL_GPU) continue
!endif
#endif
#endif
......@@ -701,31 +709,29 @@ program test
#ifdef TEST_SOLVER_2STAGE
#if TEST_NVIDIA_GPU == 1
#if defined TEST_REAL
#if (TEST_NVIDIA_GPU == 1)
kernel = ELPA_2STAGE_REAL_NVIDIA_GPU
#endif
#if defined TEST_COMPLEX
kernel = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
#endif
#endif /* TEST_NVIDIA_GPU == 1 */
#if TEST_AMD_GPU == 1
#if defined TEST_REAL
#if (TEST_AMD_GPU == 1)
kernel = ELPA_2STAGE_REAL_AMD_GPU
#endif
#if defined TEST_COMPLEX
kernel = ELPA_2STAGE_COMPLEX_AMD_GPU
#if (TEST_INTEL_GPU == 1)
kernel = ELPA_2STAGE_REAL_INTEL_GPU
#endif
#endif /* TEST_AMD_GPU == 1 */
#endif /* TEST_REAL */
#if TEST_INTEL_GPU == 1
#if defined TEST_REAL
!kernel = ELPA_2STAGE_REAL_INTEL_GPU
#endif
#if defined TEST_COMPLEX
!kernel = ELPA_2STAGE_COMPLEX_INTEL_GPU
#if (TEST_NVIDIA_GPU == 1)
kernel = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
#endif
#if (TEST_AMD_GPU == 1)
kernel = ELPA_2STAGE_COMPLEX_AMD_GPU
#endif
#if (TEST_INTEL_GPU == 1)
kernel = ELPA_2STAGE_COMPLEX_INTEL_GPU
#endif
#endif /* TEST_COMPLEX */
#endif /* TEST_GPU == 1 */
call e%set(KERNEL_KEY, kernel, error_elpa)
......
......@@ -47,6 +47,7 @@
! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
! Define TEST_NVIDIA_GPU \in [0, 1]
! Define TEST_INTEL_GPU \in [0, 1]
! Define TEST_AMD_GPU \in [0, 1]
! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
......@@ -95,6 +96,16 @@ error: define exactly one of TEST_SINGLE or TEST_DOUBLE
#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
#define INT_MPI_TYPE c_int32_t
#endif
#define TEST_GPU 0
#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1)
#undef TEST_GPU
#define TEST_GPU 1
#endif
#include "assert.h"
program test
......@@ -229,10 +240,19 @@ program test
call e%set("debug",1, error_elpa)
assert_elpa_ok(error_elpa)
#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
call e%set("nvidia-gpu", 0, error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_AMD_GPU == 1
call e%set("amd-gpu", 0, error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_INTEL_GPU == 1
call e%set("intel-gpu", 0, error_elpa)
assert_elpa_ok(error_elpa)
#endif
!call e%set("max_stored_rows", 15, error_elpa)
assert_elpa_ok(e%setup())
......
......@@ -47,6 +47,7 @@
! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
! Define TEST_NVIDIA_GPU \in [0, 1]
! Define TEST_INTEL_GPU \in [0, 1]
! Define TEST_AMD_GPU \in [0, 1]
! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
......@@ -96,6 +97,11 @@ error: define exactly one of TEST_SINGLE or TEST_DOUBLE
#define INT_MPI_TYPE c_int32_t
#endif
#define TEST_GPU 0
#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
#undef TEST_GPU
#define TEST_GPU 1
#endif
#include "assert.h"
......@@ -210,10 +216,20 @@ program test
call e1%set("debug",1, error_elpa)
assert_elpa_ok(error_elpa)
call e1%set("nvidia-gpu", 0, error_elpa)
#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
call e1%set("nvidia-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_AMD_GPU == 1
call e1%set("amd-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
call e1%set("intel-gpu", 0, error_elpa)
#endif
#if TEST_INTEL_GPU == 1
call e1%set("intel-gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#endif
!call e1%set("max_stored_rows", 15, error_elpa)
assert_elpa_ok(e1%setup())
......@@ -241,10 +257,18 @@ program test
assert_elpa_ok(error_elpa)
call e2%get("debug", int(debug,kind=c_int), error_elpa)
assert_elpa_ok(error_elpa)
#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
call e2%get("nvidia-gpu", int(gpu,kind=c_int), error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_AMD_GPU == 1
call e2%get("amd-gpu", int(gpu,kind=c_int), error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_INTEL_GPU == 1
call e2%get("intel-gpu", int(gpu,kind=c_int), error_elpa)
assert_elpa_ok(error_elpa)
#endif
if ((timings .ne. 1) .or. (debug .ne. 1) .or. (gpu .ne. 0)) then
print *, "Parameters not stored or loaded correctly. Aborting...", timings, debug, gpu
......
......@@ -47,6 +47,7 @@
! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
! Define TEST_NVIDIA_GPU \in [0, 1]
! Define TEST_INTEL_GPU \in [0, 1]
! Define TEST_AMD_GPU \in [0, 1]
! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
......@@ -96,6 +97,13 @@ error: define exactly one of TEST_SINGLE or TEST_DOUBLE
#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
#define INT_MPI_TYPE c_int32_t
#endif
#define TEST_GPU 0
#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
#undef TEST_GPU
#define TEST_GPU 1