Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
60bf252d
Commit
60bf252d
authored
Dec 07, 2020
by
Andreas Marek
Browse files
Experimental feature: complex neon kernels
parent
110737de
Changes
11
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
60bf252d
...
...
@@ -429,6 +429,13 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if
WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_neon_arch64_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_neon_arch64_1hv_single_precision.c
endif
endif
if
WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_avx_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -492,6 +499,13 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if
WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_neon_arch64_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_neon_arch64_2hv_single_precision.c
endif
endif
if
WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_avx_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
...
...
configure.ac
View file @
60bf252d
...
...
@@ -793,6 +793,8 @@ m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
complex_neon_arch64_block1
complex_neon_arch64_block2
])
m4_define(elpa_m4_vsx_kernels, [
...
...
elpa/elpa_constants.h.in
View file @
60bf252d
...
...
@@ -102,7 +102,9 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK2, 17, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 18, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 20, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
elpa/elpa_simd_constants.h
View file @
60bf252d
...
...
@@ -9,6 +9,8 @@
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define SVE512_INSTR 12
#define SVE128_INSTR 12
#define SVE256_INSTR 13
#define SVE512_INSTR 14
#define NUMBER_OF_INSTR 13
#define NUMBER_OF_INSTR 15
src/elpa2/compute_hh_trafo.F90
View file @
60bf252d
...
...
@@ -709,6 +709,36 @@ kernel)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
! neon_arch64 block1 complex kernel
#if defined(WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL))
ttt
=
mpi_wtime
()
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP_TRADITIONAL
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL */
! sve128 block1 complex kernel
#if defined(WITH_COMPLEX_SVE128_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
...
...
@@ -732,12 +762,12 @@ kernel)
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
#endif /* WITH_COMPLEX_SVE128_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
...
...
@@ -1223,6 +1253,50 @@ kernel)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
! implementation of neon_arch64 block 2 complex case
#if defined(WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP_TRADITIONAL
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL */
! implementation of sve128 block 2 complex case
#if defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL)
...
...
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
View file @
60bf252d
...
...
@@ -825,7 +825,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
...
...
@@ -841,6 +840,36 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_NEON_ARCH64_SSE
!f> interface
!f> subroutine single_hh_trafo_complex_NEON_ARCH64_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_NEON_ARCH64_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_NEON_ARCH64_SSE
!f> interface
!f> subroutine single_hh_trafo_complex_NEON_ARCH64_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_NEON_ARCH64_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
...
...
@@ -1097,6 +1126,36 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_NEON_ARCH64_SSE
!f> interface
!f> subroutine double_hh_trafo_complex_NEON_ARCH64_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_NEON_ARCH64_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_NEON_ARCH64_SSE
!f> interface
!f> subroutine double_hh_trafo_complex_NEON_ARCH64_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_NEON_ARCH64_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SVE128
!f> interface
...
...
src/elpa2/kernels/complex_neon_arch64_1hv_double_precision.c
0 → 100644
View file @
60bf252d
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include
"config-f90.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET NEON_ARCH64_128
#include
"../../general/precision_macros.h"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
src/elpa2/kernels/complex_neon_arch64_1hv_single_precision.c
0 → 100644
View file @
60bf252d
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include
"config-f90.h"
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET NEON_ARCH64_128
#include
"../../general/precision_macros.h"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef COMPLEXCASE
#undef SINGLE_PRECISION
src/elpa2/kernels/complex_neon_arch64_2hv_double_precision.c
0 → 100644
View file @
60bf252d
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include
"config-f90.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define VEC_SET NEON_ARCH64_128
#define BLOCK2 1
#include
"../../general/precision_macros.h"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
src/elpa2/kernels/complex_neon_arch64_2hv_single_precision.c
0 → 100644
View file @
60bf252d
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include
"config-f90.h"
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define VEC_SET NEON_ARCH64_128
#define BLOCK2 1
#include
"../../general/precision_macros.h"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
#undef COMPLEXCASE
src/helpers/mod_simd_kernel.F90
View file @
60bf252d
...
...
@@ -142,26 +142,28 @@ module simd_kernel
integer
(
kind
=
c_int
),
intent
(
in
)
::
kernel
integer
(
kind
=
c_int
)
::
simd_set_index
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GENERIC
)
=
GENERIC_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE
)
=
GENERIC_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_BGP
)
=
BLUEGENE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_BGQ
)
=
BLUEGENE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_BLOCK1
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_BLOCK2
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX_BLOCK1
)
=
AVX_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX_BLOCK2
)
=
AVX_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
)
=
AVX2_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX2_BLOCK2
)
=
AVX2_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
)
=
AVX512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX512_BLOCK2
)
=
AVX512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE128_BLOCK1
)
=
SVE128_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE128_BLOCK2
)
=
SVE128_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
)
=
SVE256_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE256_BLOCK2
)
=
SVE256_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
)
=
SVE512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE512_BLOCK2
)
=
SVE512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GPU
)
=
NVIDIA_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GENERIC
)
=
GENERIC_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE
)
=
GENERIC_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_BGP
)
=
BLUEGENE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_BGQ
)
=
BLUEGENE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_BLOCK1
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SSE_BLOCK2
)
=
SSE_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX_BLOCK1
)
=
AVX_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX_BLOCK2
)
=
AVX_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
)
=
AVX2_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX2_BLOCK2
)
=
AVX2_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
)
=
AVX512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_AVX512_BLOCK2
)
=
AVX512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE128_BLOCK1
)
=
SVE128_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE128_BLOCK2
)
=
SVE128_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
)
=
SVE256_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE256_BLOCK2
)
=
SVE256_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
)
=
SVE512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE512_BLOCK2
)
=
SVE512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
)
=
ARCH64_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2
)
=
ARCH64_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_GPU
)
=
NVIDIA_INSTR
simd_set_index
=
complexKernels_to_simdTable
(
kernel
)
...
...
@@ -184,6 +186,7 @@ module simd_kernel
simdTable_to_complexKernels
(
SVE128_INSTR
)
=
ELPA_2STAGE_COMPLEX_SVE128_BLOCK1
simdTable_to_complexKernels
(
SVE256_INSTR
)
=
ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
simdTable_to_complexKernels
(
SVE512_INSTR
)
=
ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
simdTable_to_complexKernels
(
ARCH64_INSTR
)
=
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
simdTable_to_complexKernels
(
NVIDIA_INSTR
)
=
ELPA_2STAGE_COMPLEX_GPU
kernel
=
simdTable_to_complexKernels
(
simd_set_index
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment