Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
1b20384e
Commit
1b20384e
authored
Mar 11, 2021
by
Andreas Marek
Browse files
Merge branch 'oneAPI' into ELPA_ROCm
parents
982b648f
b40c7d47
Changes
30
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
1b20384e
...
...
@@ -47,6 +47,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/GPU/check_for_gpu.F90
\
src/GPU/mod_vendor_agnostic_layer.F90
\
src/GPU/CUDA/mod_cuda.F90
\
src/GPU/INTEL/mod_mkl_offload.F90
\
src/GPU/ROCm/mod_hip.F90
\
src/elpa2/GPU/interface_c_gpu_kernel.F90
\
src/elpa2/GPU/CUDA/interface_c_cuda_kernel.F90
\
...
...
@@ -145,6 +146,10 @@ if WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_nvidia_gpu_complex.cu
endif
if
WITH_INTEL_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/INTEL/mkl_offload.cpp
endif
if
WITH_AMD_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_real.cpp src/elpa2/GPU/ROCm/ev_tridi_band_amd_gpu_complex.cpp
endif
...
...
@@ -790,6 +795,7 @@ clean-local:
-
rm
-rf
single_complex_2stage
*
.sh
-
rm
-rf
single_real_2stage
*
.sh
-
rm
-rf
double_instance_onenode
*
.sh
-
rm
-rf
test_
*
.sh
-
rm
-rf
$(generated_headers)
distclean-local
:
...
...
configure.ac
View file @
1b20384e
...
...
@@ -1480,9 +1480,11 @@ if test x"${need_bgq}" = x"yes"; then
fi
AC_LANG_POP([Fortran])
#compatibiility flag
AC_MSG_CHECKING(whether GPU version should be used)
AC_ARG_ENABLE([gpu],
AS_HELP_STRING([--enable-gpu],
[do use GPU version (
just for backward compatibilit
y)]),
[do use
Nvidia
GPU version (
compatibility flag, better set explicitel
y)]),
[if test x"$enableval" = x"yes"; then
use_gpu=yes
else
...
...
@@ -1496,7 +1498,6 @@ if test x"${use_gpu}" = x"yes" ; then
use_complex_nvidia_gpu=yes
fi
AC_MSG_CHECKING(whether Nvidia-GPU version should be used)
AC_ARG_ENABLE([Nvidia-gpu],
AS_HELP_STRING([--enable-Nvidia-gpu],
...
...
@@ -1510,11 +1511,29 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_
real_
nvidia_gpu=yes
use_complex
_nvidia
_gpu=yes
use_nvidia_
real_
gpu=yes
use_
nvidia_
complex_gpu=yes
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
AC_ARG_ENABLE([intel-gpu],
AS_HELP_STRING([--enable-intel-gpu],
[do use INTEL GPU version]),
[if test x"$enableval" = x"yes"; then
use_intel_gpu=yes
else
use_intel_gpu=no
fi],
[use_intel_gpu=no])
AC_MSG_RESULT([${use_intel_gpu}])
if test x"${use_intel_gpu}" = x"yes" ; then
need_intel_gpu=yes
use_real_intel_gpu=yes
use_complex_intel_gpu=yes
fi
AC_MSG_CHECKING(whether AMD-GPU version should be used)
AC_ARG_ENABLE([AMD-gpu],
AS_HELP_STRING([--enable-AMD-gpu],
...
...
@@ -1693,7 +1712,18 @@ fi
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
AC_DEFINE([WITH_INTEL_GPU_VERSION],[1],[enable INTEL GPU support])
AC_DEFINE([WITH_INTEL_GPU_KERNEL],[1],[INTEL GPU kernel should be build])
#ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=1
#ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=1
else
ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED=0
ELPA_2STAGE_REAL_INTEL_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_INTEL_GPU_COMPILED])
LT_INIT
...
...
@@ -2027,7 +2057,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=
no
need_manual_cpp=
yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
...
...
elpa/elpa_constants.h.in
View file @
1b20384e
...
...
@@ -52,26 +52,27 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AMD_GPU, 19, @ELPA_2STAGE_REAL_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 26, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 27, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 28, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_INTEL_GPU, 20, @ELPA_2STAGE_REAL_INTEL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 23, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 26, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 27, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 28, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 29, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 32, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 35, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 38, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 40, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
@@ -106,7 +107,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_INTEL_GPU, 24, @ELPA_2STAGE_COMPLEX_INTEL_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
elpa/elpa_simd_constants.h
View file @
1b20384e
...
...
@@ -7,11 +7,12 @@
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define AMD_GPU_INSTR 9
#define VSX_INSTR 10
#define ARCH64_INSTR 11
#define SPARC_INSTR 12
#define SVE128_INSTR 13
#define INTEL_GPU_INSTR 10
#define VSX_INSTR 11
#define ARCH64_INSTR 12
#define SPARC_INSTR 13
#define SVE128_INSTR 14
#define SVE256_INSTR 15
#define SVE512_INSTR 1
5
#define SVE512_INSTR 1
6
#define NUMBER_OF_INSTR 1
6
#define NUMBER_OF_INSTR 1
7
generate_automake_test_programs.py
View file @
1b20384e
...
...
@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part"
:
"-DTEST_SCALAPACK_PART"
,
}
gpu_flag
=
{
"GPU_OFF"
:
"-DTEST_NVIDIA_GPU=0 -DTEST_AMD_GPU=0"
,
"GPU_OFF"
:
"-DTEST_NVIDIA_GPU=0
-DTEST_INTEL_GPU=0
-DTEST_AMD_GPU=0"
,
"NVIDIA_GPU_ON"
:
"-DTEST_NVIDIA_GPU=1"
,
"INTEL_GPU_ON"
:
"-DTEST_INTEL_GPU=1"
,
"AMD_GPU_ON"
:
"-DTEST_AMD_GPU=1"
,
}
gpu_id_flag
=
{
...
...
@@ -87,14 +88,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
if
(
m
==
"analytic"
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
t
!=
"eigenvectors"
)):
if
(
m
==
"analytic"
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
t
!=
"eigenvectors"
)):
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if
(
m
==
"frank"
and
((
t
!=
"eigenvectors"
or
t
!=
"eigenvalues"
)
and
(
d
!=
"real"
or
p
!=
"double"
))):
continue
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
continue
# do not test single-precision scalapack
...
...
@@ -128,7 +129,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
# qr only for 2stage real
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
"NVIDIA_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
m
!=
"random"
)):
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
"NVIDIA_GPU_ON"
or
"INTEL_GPU_ON"
or
g
==
"AMD_GPU_ON"
or
m
!=
"random"
)):
continue
if
(
spl
==
"myself"
and
(
d
!=
"real"
or
p
!=
"double"
or
q
!=
0
or
m
!=
"random"
or
(
t
!=
"eigenvectors"
and
t
!=
"cholesky"
)
or
lang
!=
"Fortran"
or
lay
!=
"square"
)):
...
...
@@ -159,6 +160,10 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print
(
"if WITH_NVIDIA_GPU_VERSION"
)
endifs
+=
1
if
(
g
==
"INTEL_GPU_ON"
):
print
(
"if WITH_INTEL_GPU_VERSION"
)
endifs
+=
1
if
(
g
==
"AMD_GPU_ON"
):
print
(
"if WITH_AMD_GPU_VERSION"
)
endifs
+=
1
...
...
@@ -195,7 +200,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix
=
language_flag
[
lang
],
d
=
d
,
p
=
p
,
t
=
t
,
s
=
s
,
kernelsuffix
=
""
if
kernel
==
"nokernel"
else
"_"
+
kernel
,
gpusuffix
=
"gpu_"
if
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"AMD_GPU_ON"
)
else
""
,
gpusuffix
=
"gpu_"
if
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
g
==
"AMD_GPU_ON"
)
else
""
,
gpuidsuffix
=
"set_gpu_id_"
if
gid
else
""
,
qrsuffix
=
"qr_"
if
q
else
""
,
m
=
m
,
...
...
src/GPU/INTEL/mkl_offload.cpp
0 → 100644
View file @
1b20384e
#include <iostream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
//#include <complex>
#include "config.h"
#ifdef WITH_INTEL_GPU_VERSION
#include "mkl.h"
#include "mkl_omp_offload.h"
#include "mkl_types.h"
//#include <omp.h>
#endif
//#define MKL_Complex16 std::complex<double>
//#define MKL_Complex8 std::complex<float>
extern
"C"
{
void
mkl_offload_dgemm_c
(
char
transa
,
char
transb
,
int
m
,
int
n
,
int
k
,
double
alpha
,
double
*
a
,
int
lda
,
double
*
b
,
int
ldb
,
double
beta
,
double
*
c
,
int
ldc
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_dgemm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
,
sizec
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
"ldc="
<<
ldc
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
"ldb="
<<
ldb
<<
std
::
endl
;
std
::
cout
<<
"k="
<<
k
<<
std
::
endl
;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
std
::
cout
<<
"beta="
<<
beta
<<
std
::
endl
;
std
::
cout
<<
"Transa="
<<
transa
<<
std
::
endl
;
std
::
cout
<<
"Transb="
<<
transb
<<
std
::
endl
;
sizea
=
lda
*
k
;
sizeb
=
ldb
*
n
;
sizec
=
ldc
*
n
;
#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
dgemm
(
&
transa
,
&
transb
,
&
m
,
&
n
,
&
k
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
,
&
beta
,
c
,
&
ldc
);
}
std
::
cout
<<
"leaving mkl_offload_dgemm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_dgemm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_dgemv_c
(
char
trans
,
int
m
,
int
n
,
double
alpha
,
double
*
a
,
int
lda
,
double
*
x
,
int
incx
,
double
beta
,
double
*
y
,
int
incy
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_dgemv"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizex
,
sizey
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
std
::
cout
<<
"beta="
<<
beta
<<
std
::
endl
;
std
::
cout
<<
"Trans="
<<
trans
<<
std
::
endl
;
sizea
=
lda
*
n
;
sizex
=
n
;
sizey
=
m
;
#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
dgemv
(
&
trans
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
x
,
&
incx
,
&
beta
,
y
,
&
incy
);
}
std
::
cout
<<
"leaving mkl_offload_dgemv"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_dgemv without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_dtrmm_c
(
char
side
,
char
uplo
,
char
trans
,
char
diag
,
int
m
,
int
n
,
double
alpha
,
double
*
a
,
int
lda
,
double
*
b
,
int
ldb
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_dtrmm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
if
(
side
==
'L'
||
side
==
'l'
)
{
std
::
cout
<<
"Setting a to case L"
<<
std
::
endl
;
sizea
=
lda
*
m
;
}
if
(
side
==
'R'
||
side
==
'r'
)
{
std
::
cout
<<
"Setting a to case R"
<<
std
::
endl
;
sizea
=
lda
*
n
;
}
sizeb
=
ldb
*
n
;
#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
dtrmm
(
&
side
,
&
uplo
,
&
trans
,
&
diag
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
);
}
std
::
cout
<<
"leaving mkl_offload_dtrmm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_dtrmm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
#ifdef WANT_SINGLE_PRECISION_REAL
void
mkl_offload_sgemm_c
(
char
transa
,
char
transb
,
int
m
,
int
n
,
int
k
,
float
alpha
,
float
*
a
,
int
lda
,
float
*
b
,
int
ldb
,
float
beta
,
float
*
c
,
int
ldc
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_sgemm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
,
sizec
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
"ldc="
<<
ldc
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
"ldb="
<<
ldb
<<
std
::
endl
;
std
::
cout
<<
"k="
<<
k
<<
std
::
endl
;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
std
::
cout
<<
"beta="
<<
beta
<<
std
::
endl
;
std
::
cout
<<
"Transa="
<<
transa
<<
std
::
endl
;
std
::
cout
<<
"Transb="
<<
transb
<<
std
::
endl
;
sizea
=
lda
*
k
;
sizeb
=
ldb
*
n
;
sizec
=
ldc
*
n
;
#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
sgemm
(
&
transa
,
&
transb
,
&
m
,
&
n
,
&
k
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
,
&
beta
,
c
,
&
ldc
);
}
std
::
cout
<<
"leaving mkl_offload_sgemm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_sgemm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_sgemv_c
(
char
trans
,
int
m
,
int
n
,
float
alpha
,
float
*
a
,
int
lda
,
float
*
x
,
int
incx
,
float
beta
,
float
*
y
,
int
incy
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_sgemv"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizex
,
sizey
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
std
::
cout
<<
"beta="
<<
beta
<<
std
::
endl
;
std
::
cout
<<
"Trans="
<<
trans
<<
std
::
endl
;
sizea
=
lda
*
n
;
sizex
=
n
;
sizey
=
m
;
#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
sgemv
(
&
trans
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
x
,
&
incx
,
&
beta
,
y
,
&
incy
);
}
std
::
cout
<<
"leaving mkl_offload_sgemv"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_sgemv without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_strmm_c
(
char
side
,
char
uplo
,
char
trans
,
char
diag
,
int
m
,
int
n
,
float
alpha
,
float
*
a
,
int
lda
,
float
*
b
,
int
ldb
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_strmm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
std
::
cout
<<
"alpha="
<<
alpha
<<
std
::
endl
;
if
(
side
==
'L'
||
side
==
'l'
)
{
std
::
cout
<<
"Setting a to case L"
<<
std
::
endl
;
sizea
=
lda
*
m
;
}
if
(
side
==
'R'
||
side
==
'r'
)
{
std
::
cout
<<
"Setting a to case R"
<<
std
::
endl
;
sizea
=
lda
*
n
;
}
sizeb
=
ldb
*
n
;
#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
strmm
(
&
side
,
&
uplo
,
&
trans
,
&
diag
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
);
}
std
::
cout
<<
"leaving mkl_offload_strmm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_strmm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
#endif
/* WANT_SINGLE_PRECISION_REAL */
void
mkl_offload_zgemm_c
(
char
transa
,
char
transb
,
int
m
,
int
n
,
int
k
,
MKL_Complex16
alpha
,
MKL_Complex16
*
a
,
int
lda
,
MKL_Complex16
*
b
,
int
ldb
,
MKL_Complex16
beta
,
MKL_Complex16
*
c
,
int
ldc
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_zgemm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
,
sizec
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
"ldc="
<<
ldc
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
"ldb="
<<
ldb
<<
std
::
endl
;
std
::
cout
<<
"k="
<<
k
<<
std
::
endl
;
std
::
cout
<<
"Transa="
<<
transa
<<
std
::
endl
;
std
::
cout
<<
"Transb="
<<
transb
<<
std
::
endl
;
sizea
=
lda
*
k
;
sizeb
=
ldb
*
n
;
sizec
=
ldc
*
n
;
#pragma omp target data map(to : a [0:sizea], b [0:sizeb]) map(tofrom : c [0:sizec]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b, c)
zgemm
(
&
transa
,
&
transb
,
&
m
,
&
n
,
&
k
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
,
&
beta
,
c
,
&
ldc
);
}
std
::
cout
<<
"leaving mkl_offload_zgemm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_zgemm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_zgemv_c
(
char
trans
,
int
m
,
int
n
,
MKL_Complex16
alpha
,
MKL_Complex16
*
a
,
int
lda
,
MKL_Complex16
*
x
,
int
incx
,
MKL_Complex16
beta
,
MKL_Complex16
*
y
,
int
incy
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_zgemv"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizex
,
sizey
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
std
::
cout
<<
"Trans="
<<
trans
<<
std
::
endl
;
sizea
=
lda
*
n
;
sizex
=
n
;
sizey
=
m
;
#pragma omp target data map(to : a [0:sizea], x [0:sizex]) map(tofrom : y [0:sizey]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, x, y)
zgemv
(
&
trans
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
x
,
&
incx
,
&
beta
,
y
,
&
incy
);
}
std
::
cout
<<
"leaving mkl_offload_zgemv"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_zgemv without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
void
mkl_offload_ztrmm_c
(
char
side
,
char
uplo
,
char
trans
,
char
diag
,
int
m
,
int
n
,
MKL_Complex16
alpha
,
MKL_Complex16
*
a
,
int
lda
,
MKL_Complex16
*
b
,
int
ldb
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_ztrmm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
std
::
endl
;
//std::cout << "sizeX=" << sizeX << std::endl;
//std::cout << "sizeY=" << sizeY << std::endl;
if
(
side
==
'L'
||
side
==
'l'
)
{
std
::
cout
<<
"Setting a to case L"
<<
std
::
endl
;
sizea
=
lda
*
m
;
}
if
(
side
==
'R'
||
side
==
'r'
)
{
std
::
cout
<<
"Setting a to case R"
<<
std
::
endl
;
sizea
=
lda
*
n
;
}
sizeb
=
ldb
*
n
;
#pragma omp target data map(to : a [0:sizea]) map(tofrom : b [0:sizeb]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a, b)
ztrmm
(
&
side
,
&
uplo
,
&
trans
,
&
diag
,
&
m
,
&
n
,
&
alpha
,
a
,
&
lda
,
b
,
&
ldb
);
}
std
::
cout
<<
"leaving mkl_offload_ztrmm"
<<
std
::
endl
;
#else
std
::
cout
<<
"ERROR: calling mkl_offload_ztrmm without build for Intel GPU support!"
<<
std
::
endl
;
std
::
cout
<<
"ERROR: You should never see this message"
<<
std
::
endl
;
#endif
}
#ifdef WANT_SINGLE_PRECISION_COMPLEX
void
mkl_offload_cgemm_c
(
char
transa
,
char
transb
,
int
m
,
int
n
,
int
k
,
MKL_Complex8
alpha
,
MKL_Complex8
*
a
,
int
lda
,
MKL_Complex8
*
b
,
int
ldb
,
MKL_Complex8
beta
,
MKL_Complex8
*
c
,
int
ldc
)
{
#ifdef WITH_INTEL_GPU_VERSION
std
::
cout
<<
"In mkl_offload_cgemm"
<<
std
::
endl
;
// at a later time the device should be set differently
int
dnum
=
0
;
int
sizea
,
sizeb
,
sizec
;
std
::
cout
<<
"m="
<<
m
<<
"lda="
<<
lda
<<
"ldc="
<<
ldc
<<
std
::
endl
;
std
::
cout
<<
"n="
<<
n
<<
"ldb="
<<
ldb
<<
std
::
endl
;
std
::
cout
<<
"k="
<<
k
<<
std
::
endl
;
std
::
cout
<<
"Transa="
<<
transa
<<
std
::
endl
;
std
::
cout
<<
"Transb="
<<
transb
<<
std
::
endl
;
sizea
=
lda
*
k
;
sizeb
=
ldb
*
n
;
sizec
=
ldc
*
n
;