Commit 1b20384e authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'oneAPI' into ELPA_ROCm

parents 982b648f b40c7d47
......@@ -47,6 +47,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/GPU/check_for_gpu.F90 \
src/GPU/mod_vendor_agnostic_layer.F90 \
src/GPU/CUDA/mod_cuda.F90 \
src/GPU/INTEL/mod_mkl_offload.F90 \
src/GPU/ROCm/mod_hip.F90 \
src/elpa2/GPU/interface_c_gpu_kernel.F90 \
src/elpa2/GPU/CUDA/interface_c_cuda_kernel.F90 \
......@@ -145,6 +146,10 @@ if WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu
endif
if WITH_INTEL_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/INTEL/mkl_offload.cpp
endif
if WITH_AMD_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_real.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_complex.cpp
endif
......@@ -790,6 +795,7 @@ clean-local:
-rm -rf single_complex_2stage*.sh
-rm -rf single_real_2stage*.sh
-rm -rf double_instance_onenode*.sh
-rm -rf test_*.sh
-rm -rf $(generated_headers)
distclean-local:
......
......@@ -1480,9 +1480,11 @@ if test x"${need_bgq}" = x"yes"; then
fi
AC_LANG_POP([Fortran])
#compatibility flag
AC_MSG_CHECKING(whether GPU version should be used)
AC_ARG_ENABLE([gpu],
AS_HELP_STRING([--enable-gpu],
[do use GPU version (just for backward compatibility)]),
[do use Nvidia GPU version (compatibility flag, better set explicitly)]),
[if test x"$enableval" = x"yes"; then
use_gpu=yes
else
......@@ -1496,7 +1498,6 @@ if test x"${use_gpu}" = x"yes" ; then
use_complex_nvidia_gpu=yes
fi
AC_MSG_CHECKING(whether Nvidia-GPU version should be used)
AC_ARG_ENABLE([Nvidia-gpu],
AS_HELP_STRING([--enable-Nvidia-gpu],
......@@ -1510,11 +1511,29 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
AC_ARG_ENABLE([intel-gpu],
AS_HELP_STRING([--enable-intel-gpu],
[do use INTEL GPU version]),
[if test x"$enableval" = x"yes"; then
use_intel_gpu=yes
else
use_intel_gpu=no
fi],
[use_intel_gpu=no])
AC_MSG_RESULT([${use_intel_gpu}])
if test x"${use_intel_gpu}" = x"yes" ; then
need_intel_gpu=yes
use_real_intel_gpu=yes
use_complex_intel_gpu=yes
fi
AC_MSG_CHECKING(whether AMD-GPU version should be used)
AC_ARG_ENABLE([AMD-gpu],
AS_HELP_STRING([--enable-AMD-gpu],
......@@ -1693,7 +1712,18 @@ fi
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
AC_DEFINE([WITH_INTEL_GPU_VERSION],[1],[enable INTEL GPU support])
AC_DEFINE([WITH_INTEL_GPU_KERNEL],[1],[INTEL GPU kernel should be build])
#ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=1
#ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=0
ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_INTEL_GPU_COMPILED])
LT_INIT
......@@ -2027,7 +2057,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=no
need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
......
......@@ -52,26 +52,27 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AMD_GPU, 19, @ELPA_2STAGE_REAL_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 26, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 27, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 28, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_INTEL_GPU, 20, @ELPA_2STAGE_REAL_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 23, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 26, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 27, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 28, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 29, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 32, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 35, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -106,7 +107,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -7,11 +7,12 @@
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define AMD_GPU_INSTR 9
#define VSX_INSTR 10
#define ARCH64_INSTR 11
#define SPARC_INSTR 12
#define SVE128_INSTR 13
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 15
#define SVE512_INSTR 16
#define NUMBER_OF_INSTR 16
#define NUMBER_OF_INSTR 17
......@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part": "-DTEST_SCALAPACK_PART",
}
gpu_flag = {
"GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_AMD_GPU=0",
"GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0 -DTEST_AMD_GPU=0",
"NVIDIA_GPU_ON": "-DTEST_NVIDIA_GPU=1",
"INTEL_GPU_ON": "-DTEST_INTEL_GPU=1",
"AMD_GPU_ON": "-DTEST_AMD_GPU=1",
}
gpu_id_flag = {
......@@ -87,14 +88,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors")):
if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors")):
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if(m == "frank" and ((t != "eigenvectors" or t != "eigenvalues") and (d != "real" or p != "double"))):
continue
if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors" or m != "analytic")):
if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON" or t != "eigenvectors" or m != "analytic")):
continue
# do not test single-precision scalapack
......@@ -128,7 +129,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
# qr only for 2stage real
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or g == "AMD_GPU_ON" or m != "random")):
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON" or m != "random")):
continue
if(spl == "myself" and (d != "real" or p != "double" or q != 0 or m != "random" or (t != "eigenvectors" and t != "cholesky") or lang != "Fortran" or lay != "square")):
......@@ -159,6 +160,10 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print("if WITH_NVIDIA_GPU_VERSION")
endifs += 1
if (g == "INTEL_GPU_ON"):
print("if WITH_INTEL_GPU_VERSION")
endifs += 1
if (g == "AMD_GPU_ON"):
print("if WITH_AMD_GPU_VERSION")
endifs += 1
......@@ -195,7 +200,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "AMD_GPU_ON") else "",
gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or g == "AMD_GPU_ON") else "",
gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "",
m=m,
......
#include <iostream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
//#include <complex>
#include "config.h"
#ifdef WITH_INTEL_GPU_VERSION
#include "mkl.h"
#include "mkl_omp_offload.h"
#include "mkl_types.h"
//#include <omp.h>
#endif
//#define MKL_Complex16 std::complex<double>
//#define MKL_Complex8 std::complex<float>
extern "C" {
void mkl_offload_dgemm_c(char transa, char transb, int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc) {
// C-callable wrapper that offloads a double-precision GEMM
// (C = alpha*op(A)*op(B) + beta*C) to an Intel GPU via MKL's
// OpenMP "target variant dispatch" mechanism.  Without Intel GPU
// support this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_dgemm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizeb, sizec;

  std::cout << "m=" << m << "lda=" << lda << "ldc=" << ldc << std::endl;
  std::cout << "n=" << n << "ldb=" << ldb << std::endl;
  std::cout << "k=" << k << std::endl;
  std::cout << "alpha=" << alpha << std::endl;
  std::cout << "beta=" << beta << std::endl;
  std::cout << "Transa=" << transa << std::endl;
  std::cout << "Transb=" << transb << std::endl;

  // The stored extent of A/B depends on the transpose flag (BLAS
  // convention): op(A) is m x k, so A occupies lda*k elements for 'N'
  // but lda*m for 'T'/'C'; analogously B occupies ldb*n for 'N' and
  // ldb*k otherwise.  Mapping the wrong extent would transfer the
  // wrong amount of data to the device.
  sizea = (transa == 'N' || transa == 'n') ? lda * k : lda * m;
  sizeb = (transb == 'N' || transb == 'n') ? ldb * n : ldb * k;
  sizec = ldc * n;

#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
    dgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
  }
  std::cout << "leaving mkl_offload_dgemm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_dgemm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_dgemv_c(char trans, int m, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy) {
// C-callable wrapper that offloads a double-precision GEMV
// (y = alpha*op(A)*x + beta*y) to an Intel GPU via MKL's OpenMP
// "target variant dispatch" mechanism.  Without Intel GPU support
// this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_dgemv" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizex, sizey;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;
  std::cout << "alpha=" << alpha << std::endl;
  std::cout << "beta=" << beta << std::endl;
  std::cout << "Trans=" << trans << std::endl;

  // A is always stored m x n (lda >= m), independent of trans.
  sizea = lda * n;
  // Vector lengths follow the BLAS convention: for 'N', x has n and y
  // has m elements; for 'T'/'C' the roles swap.  The accessed extent is
  // additionally stretched by the strides incx/incy.
  const bool notrans = (trans == 'N' || trans == 'n');
  const int xlen = notrans ? n : m;
  const int ylen = notrans ? m : n;
  sizex = (xlen > 0) ? 1 + (xlen - 1) * std::abs(incx) : 0;
  sizey = (ylen > 0) ? 1 + (ylen - 1) * std::abs(incy) : 0;

#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
    dgemv(&trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
  }
  std::cout << "leaving mkl_offload_dgemv" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_dgemv without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_dtrmm_c(char side, char uplo, char trans, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb) {
// C-callable wrapper that offloads a double-precision TRMM
// (B = alpha*op(A)*B or B = alpha*B*op(A)) to an Intel GPU via MKL's
// OpenMP "target variant dispatch" mechanism.  Without Intel GPU
// support this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_dtrmm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  // Initialize sizea so an invalid 'side' maps an empty range instead of
  // reading an uninitialized variable in the map clause (undefined behavior).
  int sizea = 0;
  int sizeb;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;
  std::cout << "alpha=" << alpha << std::endl;

  // Triangular A is m x m when applied from the left, n x n from the right.
  if (side == 'L' || side == 'l') {
    std::cout << "Setting a to case L" << std::endl;
    sizea = lda * m;
  }
  if (side == 'R' || side == 'r') {
    std::cout << "Setting a to case R" << std::endl;
    sizea = lda * n;
  }
  sizeb = ldb * n;

#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
    dtrmm(&side, &uplo, &trans, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
  }
  std::cout << "leaving mkl_offload_dtrmm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_dtrmm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
#ifdef WANT_SINGLE_PRECISION_REAL
void mkl_offload_sgemm_c(char transa, char transb, int m, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc) {
// Single-precision variant of mkl_offload_dgemm_c: offloads
// C = alpha*op(A)*op(B) + beta*C to an Intel GPU via MKL's OpenMP
// "target variant dispatch" mechanism.  Without Intel GPU support
// this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_sgemm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizeb, sizec;

  std::cout << "m=" << m << "lda=" << lda << "ldc=" << ldc << std::endl;
  std::cout << "n=" << n << "ldb=" << ldb << std::endl;
  std::cout << "k=" << k << std::endl;
  std::cout << "alpha=" << alpha << std::endl;
  std::cout << "beta=" << beta << std::endl;
  std::cout << "Transa=" << transa << std::endl;
  std::cout << "Transb=" << transb << std::endl;

  // The stored extent of A/B depends on the transpose flag (BLAS
  // convention): A occupies lda*k elements for 'N' but lda*m for
  // 'T'/'C'; B occupies ldb*n for 'N' and ldb*k otherwise.
  sizea = (transa == 'N' || transa == 'n') ? lda * k : lda * m;
  sizeb = (transb == 'N' || transb == 'n') ? ldb * n : ldb * k;
  sizec = ldc * n;

#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
    sgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
  }
  std::cout << "leaving mkl_offload_sgemm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_sgemm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_sgemv_c(char trans, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy) {
// Single-precision variant of mkl_offload_dgemv_c: offloads
// y = alpha*op(A)*x + beta*y to an Intel GPU via MKL's OpenMP
// "target variant dispatch" mechanism.  Without Intel GPU support
// this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_sgemv" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizex, sizey;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;
  std::cout << "alpha=" << alpha << std::endl;
  std::cout << "beta=" << beta << std::endl;
  std::cout << "Trans=" << trans << std::endl;

  // A is always stored m x n (lda >= m), independent of trans.
  sizea = lda * n;
  // Vector lengths follow the BLAS convention: for 'N', x has n and y
  // has m elements; for 'T'/'C' the roles swap.  The accessed extent is
  // additionally stretched by the strides incx/incy.
  const bool notrans = (trans == 'N' || trans == 'n');
  const int xlen = notrans ? n : m;
  const int ylen = notrans ? m : n;
  sizex = (xlen > 0) ? 1 + (xlen - 1) * std::abs(incx) : 0;
  sizey = (ylen > 0) ? 1 + (ylen - 1) * std::abs(incy) : 0;

#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
    sgemv(&trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
  }
  std::cout << "leaving mkl_offload_sgemv" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_sgemv without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_strmm_c(char side, char uplo, char trans, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb) {
// Single-precision variant of mkl_offload_dtrmm_c: offloads
// B = alpha*op(A)*B or B = alpha*B*op(A) to an Intel GPU via MKL's
// OpenMP "target variant dispatch" mechanism.  Without Intel GPU
// support this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_strmm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  // Initialize sizea so an invalid 'side' maps an empty range instead of
  // reading an uninitialized variable in the map clause (undefined behavior).
  int sizea = 0;
  int sizeb;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;
  std::cout << "alpha=" << alpha << std::endl;

  // Triangular A is m x m when applied from the left, n x n from the right.
  if (side == 'L' || side == 'l') {
    std::cout << "Setting a to case L" << std::endl;
    sizea = lda * m;
  }
  if (side == 'R' || side == 'r') {
    std::cout << "Setting a to case R" << std::endl;
    sizea = lda * n;
  }
  sizeb = ldb * n;

#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
    strmm(&side, &uplo, &trans, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
  }
  std::cout << "leaving mkl_offload_strmm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_strmm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
#endif /* WANT_SINGLE_PRECISION_REAL */
void mkl_offload_zgemm_c(char transa, char transb, int m, int n, int k, MKL_Complex16 alpha, MKL_Complex16 *a, int lda, MKL_Complex16 *b, int ldb, MKL_Complex16 beta, MKL_Complex16 *c, int ldc) {
// Double-complex variant of mkl_offload_dgemm_c: offloads
// C = alpha*op(A)*op(B) + beta*C to an Intel GPU via MKL's OpenMP
// "target variant dispatch" mechanism.  Without Intel GPU support
// this degenerates to an error message.
// NOTE(review): alpha/beta are not printed here, unlike the real
// variants — presumably because MKL_Complex16 has no operator<<.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_zgemm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizeb, sizec;

  std::cout << "m=" << m << "lda=" << lda << "ldc=" << ldc << std::endl;
  std::cout << "n=" << n << "ldb=" << ldb << std::endl;
  std::cout << "k=" << k << std::endl;
  std::cout << "Transa=" << transa << std::endl;
  std::cout << "Transb=" << transb << std::endl;

  // The stored extent of A/B depends on the transpose flag (BLAS
  // convention): A occupies lda*k elements for 'N' but lda*m for
  // 'T'/'C'; B occupies ldb*n for 'N' and ldb*k otherwise.
  sizea = (transa == 'N' || transa == 'n') ? lda * k : lda * m;
  sizeb = (transb == 'N' || transb == 'n') ? ldb * n : ldb * k;
  sizec = ldc * n;

#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
    zgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
  }
  std::cout << "leaving mkl_offload_zgemm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_zgemm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_zgemv_c(char trans, int m, int n, MKL_Complex16 alpha, MKL_Complex16 *a, int lda, MKL_Complex16 *x, int incx, MKL_Complex16 beta, MKL_Complex16 *y, int incy) {
// Double-complex variant of mkl_offload_dgemv_c: offloads
// y = alpha*op(A)*x + beta*y to an Intel GPU via MKL's OpenMP
// "target variant dispatch" mechanism.  Without Intel GPU support
// this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_zgemv" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  int sizea, sizex, sizey;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;
  std::cout << "Trans=" << trans << std::endl;

  // A is always stored m x n (lda >= m), independent of trans.
  sizea = lda * n;
  // Vector lengths follow the BLAS convention: for 'N', x has n and y
  // has m elements; for 'T'/'C' the roles swap.  The accessed extent is
  // additionally stretched by the strides incx/incy.
  const bool notrans = (trans == 'N' || trans == 'n');
  const int xlen = notrans ? n : m;
  const int ylen = notrans ? m : n;
  sizex = (xlen > 0) ? 1 + (xlen - 1) * std::abs(incx) : 0;
  sizey = (ylen > 0) ? 1 + (ylen - 1) * std::abs(incy) : 0;

#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
    zgemv(&trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
  }
  std::cout << "leaving mkl_offload_zgemv" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_zgemv without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
void mkl_offload_ztrmm_c(char side, char uplo, char trans, char diag, int m, int n, MKL_Complex16 alpha, MKL_Complex16 *a, int lda, MKL_Complex16 *b, int ldb) {
// Double-complex variant of mkl_offload_dtrmm_c: offloads
// B = alpha*op(A)*B or B = alpha*B*op(A) to an Intel GPU via MKL's
// OpenMP "target variant dispatch" mechanism.  Without Intel GPU
// support this degenerates to an error message.
#ifdef WITH_INTEL_GPU_VERSION
  std::cout << "In mkl_offload_ztrmm" << std::endl;
  // at a later time the device should be set differently
  int dnum = 0;

  // Initialize sizea so an invalid 'side' maps an empty range instead of
  // reading an uninitialized variable in the map clause (undefined behavior).
  int sizea = 0;
  int sizeb;

  std::cout << "m=" << m << "lda=" << lda << std::endl;
  std::cout << "n=" << n << std::endl;
  //std::cout << "sizeX=" << sizeX << std::endl;
  //std::cout << "sizeY=" << sizeY << std::endl;

  // Triangular A is m x m when applied from the left, n x n from the right.
  if (side == 'L' || side == 'l') {
    std::cout << "Setting a to case L" << std::endl;
    sizea = lda * m;
  }
  if (side == 'R' || side == 'r') {
    std::cout << "Setting a to case R" << std::endl;
    sizea = lda * n;
  }
  sizeb = ldb * n;

#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
  {
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
    ztrmm(&side, &uplo, &trans, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
  }
  std::cout << "leaving mkl_offload_ztrmm" << std::endl;
#else
  std::cout << "ERROR: calling mkl_offload_ztrmm without build for Intel GPU support!" << std::endl;
  std::cout << "ERROR: You should never see this message" << std::endl;
#endif
}
#ifdef WANT_SINGLE_PRECISION_COMPLEX
void mkl_offload_cgemm_c(char transa, char transb, int m, int n, int k, MKL_Complex8 alpha, MKL_Complex8 *a, int lda, MKL_Complex8 *b, int ldb, MKL_Complex8 beta, MKL_Complex8 *c, int ldc) {
#ifdef WITH_INTEL_GPU_VERSION
std::cout << "In mkl_offload_cgemm" << std::endl;
// at a later time the device should be set differently
int dnum = 0;
int sizea, sizeb, sizec;
std::cout << "m=" << m << "lda=" << lda << "ldc=" << ldc << std::endl;
std::cout << "n=" << n << "ldb=" << ldb << std::endl;
std::cout << "k=" << k << std::endl;
std::cout << "Transa=" << transa << std::endl;
std::cout << "Transb=" << transb << std::endl;
sizea = lda * k;
sizeb = ldb * n;
sizec = ldc * n;