...
 
Commits (2)
  • Andreas Marek's avatar
    Add mixed-cluster option to configure · 973d0296
    Andreas Marek authored
    973d0296
  • Andreas Marek's avatar
    HETEROGENOUS_CLUSTER support · 5b3324ff
    Andreas Marek authored
    On heterogeneous clusters, i.e. clusters whose nodes have different CPUs,
    the _experimental_ feature (--enable-heterogenous-cluster-support) can be used:
    
    It compares the (Intel) cpuid set of all CPUs which are used by ELPA MPI
    processes and finds the SIMD instruction set, which is supported by all
    used CPUs. The ELPA 2stage back-transformation kernel (a.k.a "kernel")
    will be set accordingly on all MPI processes.
    
    This feature can override the kernel setting previously chosen by
    the user!
    
    At the moment it only works for Intel CPUs, i.e. clusters that mix
    nodes with Intel CPUs and nodes with e.g. AMD CPUs are currently _NOT_
    supported.
    
    Since this is an experimental feature, it might be dropped again in the
    future if it turns out not to be useful for the users.
    5b3324ff
......@@ -177,6 +177,11 @@ if !HAVE_DETAILED_TIMINGS
endif
endif
# Only when the HAVE_HETEROGENOUS_CLUSTER_SUPPORT conditional is true, add the
# CPUID helper (C) and the SIMD-kernel mapping module (Fortran) to the sources
# of the private ELPA library.
if HAVE_HETEROGENOUS_CLUSTER_SUPPORT
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90
endif
......@@ -480,6 +485,7 @@ nobase_elpa_include_HEADERS = \
$(wildcard modules/*) \
src/helpers/lapack_interfaces.h \
src/helpers/scalapack_interfaces.h \
elpa/elpa_simd_constants.h \
elpa/elpa.h \
elpa/elpa_generic.h \
elpa/elpa_legacy.h
......
......@@ -189,6 +189,24 @@ fi
AX_EXT
dnl heterogenous-cluster-support
dnl
dnl Adds the configure switch --enable-heterogenous-cluster-support (default: no).
dnl If it is enabled, the preprocessor symbol HAVE_HETEROGENOUS_CLUSTER_SUPPORT
dnl is defined and the Automake conditional of the same name becomes true.
dnl Note: the help string has to spell out the "--enable-" prefix, since that
dnl is the option name AC_ARG_ENABLE actually creates.
AC_MSG_CHECKING(whether heterogenous-cluster-support should be enabled)
AC_ARG_ENABLE([heterogenous-cluster-support],
              AS_HELP_STRING([--enable-heterogenous-cluster-support],
                             [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary, has a performance penalty! This feature is experimental!]),
              [
               if test x"$enableval" = x"yes"; then
                 enable_heterogenous_cluster_support=yes
               else
                 enable_heterogenous_cluster_support=no
               fi
              ],
              [enable_heterogenous_cluster_support="no"])
AC_MSG_RESULT([$enable_heterogenous_cluster_support])
if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
  AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs])
fi
AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
#define CPU_MANUFACTURER 1
#define GENERIC_INSTR 2
#define BLUEGENE_INSTR 3
#define SSE_INSTR 4
#define AVX_INSTR 5
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define NUMBER_OF_INSTR 12
......@@ -80,7 +80,7 @@ module elpa2_impl
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
!-------------------------------------------------------------------------------
!> \brief elpasolve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
......
......@@ -49,6 +49,9 @@
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "elpa/elpa_simd_constants.h"
function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
......@@ -64,7 +67,9 @@
use cuda_functions
use mod_check_for_gpu
use elpa_omp
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
use simd_kernel
#endif
use iso_c_binding
implicit none
#include "../general/precision_kinds.F90"
......@@ -74,14 +79,14 @@
logical :: useQR
logical :: useQRActual
#endif
integer(kind=c_int) :: kernel
integer(kind=c_int) :: kernel, kernelByUser
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
......@@ -124,6 +129,12 @@
do_trans_to_band, do_trans_to_full
integer(kind=ik) :: nrThreads
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR),simdSetRequested(NUMBER_OF_INSTR)
#endif
#if REALCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
......@@ -377,6 +388,87 @@
#endif
! consistency check: is user set kernel still identical with "kernel" or did
! we change it above? This is a mess and should be cleaned up
call obj%get(KERNEL_STRING,kernelByUser,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
if (kernelByUser .ne. kernel) then
call obj%set(KERNEL_STRING, kernel, error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
endif
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
    ! Find a kernel which is supported on all used CPUs.
    ! At the moment this works only on Intel CPUs.
    !
    ! Step 1: ask the local CPU which SIMD instruction sets it offers. Every
    !         supported set is flagged with 1 at its *_INSTR index.
    simdSetAvailable(:) = 0
    call get_cpuid_set(simdSetAvailable, size(simdSetAvailable))
#ifdef WITH_MPI
    ! Step 2: bitwise-AND the flags over all processes, so that only those
    !         instruction sets stay flagged which every process supports.
    !         For an in-place reduction MPI_IN_PLACE must be given as the
    !         *send* buffer; the array to be reduced is the receive buffer.
    call MPI_ALLREDUCE(MPI_IN_PLACE, simdSetAvailable, size(simdSetAvailable), MPI_INTEGER, &
                       MPI_BAND, mpi_comm_all, mpierr)
#endif
    ! compare user chosen kernel with possible kernels
    call obj%get(KERNEL_STRING,kernelByUser,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    ! map the kernel to its SIMD set, and check whether this set is available on all cores
#if REALCASE == 1
    if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then
#endif
#if COMPLEXCASE == 1
    if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then
#endif
      ! if we are not purely running on Intel CPUs, this feature does not work at the moment
      ! this restriction should be lifted step by step
      if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then
        if (my_pe == 0 ) then
          write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support."
          write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!"
          write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now"
          stop
        endif
      else
        if (my_pe == 0 ) then
          write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs"
          write(error_unit,*) "ELPA will use another kernel..."
        endif
        ! Find the best kernel available for the supported instruction sets:
        ! walk from the highest instruction-set index downwards and stop at
        ! the first set which is available everywhere AND whose kernel can be
        ! set on this ELPA object.
        do i = NUMBER_OF_INSTR, 2, -1
          if (simdSetAvailable(i) == 1) then
            ! map to "best" kernel with this instruction set
            ! this can be only done for kernels that ELPA has been configured to use
#if REALCASE == 1
            kernel = map_simd_instruction_to_real_kernel(i)
#endif
#if COMPLEXCASE == 1
            kernel = map_simd_instruction_to_complex_kernel(i)
#endif
            if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then
              call obj%set(KERNEL_STRING, kernel, error)
              if (error .ne. ELPA_OK) then
                print *,"Problem setting option. Aborting..."
                stop
              endif
              if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel)
              ! The best usable kernel has been found. Without leaving the
              ! loop here, the lower instruction sets would overwrite it again.
              exit
            endif
          endif
        enddo
      endif
    endif
#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */
#if REALCASE == 1
call obj%get("qr",qr,error)
......
......@@ -1799,7 +1799,6 @@ module elpa_impl
#endif
end select
!print *, "testing, before C call, ts_impl%current is ", ts_impl%current
if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, &
ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then
......@@ -1812,7 +1811,6 @@ module elpa_impl
error = ELPA_ERROR_CANNOT_OPEN_FILE
#endif
endif
!print *, "testing, after C call, ts_impl%current is ", ts_impl%current
end subroutine
......
#include "config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include "elpa/elpa_simd_constants.h"
/*
 * Execute the x86 CPUID instruction.
 *
 * On entry *eax holds the requested leaf and *ecx the sub-leaf; on return
 * the four arguments hold the values CPUID left in EAX, EBX, ECX and EDX.
 */
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
                                unsigned int *ecx, unsigned int *edx)
{
  /* inputs: ecx is often an input as well as an output */
  unsigned int leaf    = *eax;
  unsigned int subleaf = *ecx;
  unsigned int reg_a, reg_b, reg_c, reg_d;

  __asm__ __volatile__("cpuid"
                       : "=a" (reg_a),
                         "=b" (reg_b),
                         "=c" (reg_c),
                         "=d" (reg_d)
                       : "a" (leaf), "c" (subleaf)
                       : "memory");

  *eax = reg_a;
  *ebx = reg_b;
  *ecx = reg_c;
  *edx = reg_d;
}
// GCC Intrinsics
#include <cpuid.h>
/*
 * Query CPUID leaf InfoType (sub-leaf 0) through the compiler's <cpuid.h>
 * helper and store EAX, EBX, ECX, EDX in info[0] .. info[3].
 */
void cpuid(int info[4], int InfoType){
  unsigned int reg_a = 0, reg_b = 0, reg_c = 0, reg_d = 0;

  __cpuid_count(InfoType, 0, reg_a, reg_b, reg_c, reg_d);

  info[0] = (int) reg_a;
  info[1] = (int) reg_b;
  info[2] = (int) reg_c;
  info[3] = (int) reg_d;
}
/*
!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
!f> interface
!f> subroutine get_cpuid_set(simdSet, n) &
!f> bind(C, name="get_cpuid_set")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int), value :: n
!f> integer(kind=c_int) :: simdSet(n)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
 * Report which SIMD instruction sets the CPU executing this call offers.
 *
 * set     : result array, indexed with the (1-based) constants of
 *           elpa/elpa_simd_constants.h minus one. The entries written are
 *             CPU_MANUFACTURER : 1 if the CPUID vendor string is "GenuineIntel", else 0
 *             GENERIC_INSTR    : always 1
 *             SSE_INSTR        : 1 if SSE4.2 is reported, else 0
 *             AVX_INSTR        : 1 if AVX is reported, else 0
 *             AVX2_INSTR       : 1 if AVX2 is reported, else 0
 *             AVX512_INSTR     : 1 if AVX-512 Foundation is reported, else 0
 *           All other entries are left untouched.
 * nlength : number of entries of set; must be at least AVX512_INSTR,
 *           otherwise nothing is written and a message is printed to stderr.
 *
 * NOTE(review): only the CPUID feature bits are evaluated; whether the
 * operating system enabled the AVX / AVX-512 register state (OSXSAVE/XGETBV)
 * is not checked here.
 */
void get_cpuid_set(int *set, int nlength){
  static const char intel_vendor[] = "GenuineIntel";

  /* unsigned and zero-initialised: these are the input AND output registers
     of native_cpuid (eax = leaf 0, ecx = sub-leaf 0) */
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  char vendor[13];

  int info[4] = {0, 0, 0, 0};
  int nIds;

  /* Feature flags default to "not available", so that they are well defined
     even if the CPU does not implement the CPUID leaf they are read from. */
  bool HW_SSE42   = false;
  bool HW_AVX     = false;
  bool HW_AVX2    = false;
  bool HW_AVX512F = false;

  /* never write outside of the array handed in by the caller */
  if (set == NULL || nlength < AVX512_INSTR) {
    fprintf(stderr, "get_cpuid_set: invalid result array (length %d, need at least %d)\n",
            nlength, AVX512_INSTR);
    return;
  }

  /* leaf 0: the vendor string is returned in the order EBX, EDX, ECX */
  native_cpuid(&eax, &ebx, &ecx, &edx);
  memcpy(vendor,     &ebx, 4);
  memcpy(vendor + 4, &edx, 4);
  memcpy(vendor + 8, &ecx, 4);
  vendor[12] = '\0';

  set[CPU_MANUFACTURER - 1] = (strcmp(vendor, intel_vendor) == 0) ? 1 : 0;

  /* leaf 0 also reports the highest basic leaf supported by this CPU */
  cpuid(info, 0);
  nIds = info[0];

  /* Detect features */
  if (nIds >= 0x00000001){
    cpuid(info, 0x00000001);
    HW_SSE42 = ((unsigned int) info[2] & (1u << 20)) != 0;  /* ECX bit 20 */
    HW_AVX   = ((unsigned int) info[2] & (1u << 28)) != 0;  /* ECX bit 28 */
  }
  if (nIds >= 0x00000007){
    cpuid(info, 0x00000007);
    HW_AVX2    = ((unsigned int) info[1] & (1u <<  5)) != 0;  /* EBX bit 5  */
    HW_AVX512F = ((unsigned int) info[1] & (1u << 16)) != 0;  /* EBX bit 16 */
  }

  /* always allow GENERIC */
  set[GENERIC_INSTR - 1] = 1;

  /* the rest depends on the CPU; write 0 as well as 1, so that the result
     does not depend on how the caller initialised the array */
  set[SSE_INSTR    - 1] = HW_SSE42   ? 1 : 0;
  set[AVX_INSTR    - 1] = HW_AVX     ? 1 : 0;
  set[AVX2_INSTR   - 1] = HW_AVX2    ? 1 : 0;
  set[AVX512_INSTR - 1] = HW_AVX512F ? 1 : 0;
}
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#include "config-f90.h"
#include "elpa/elpa_simd_constants.h"
!> \brief Mapping between the ELPA 2stage kernels and the SIMD instruction sets
!>        (the *_INSTR constants of elpa/elpa_simd_constants.h) they need.
!>
!> The lookup tables are module variables; each mapping function (re)fills
!> its table before it performs the lookup.
module simd_kernel
  use elpa_constants
  use iso_c_binding
  implicit none

  integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS)
  integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR)
  integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS)
  integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR)

  contains

    !> \brief Return the SIMD instruction set a real 2stage kernel requires.
    !> \param  kernel          one of the ELPA_2STAGE_REAL_* kernel constants
    !> \result simd_set_index  the matching *_INSTR constant
    function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index)
      implicit none
      integer(kind=c_int), intent(in) :: kernel
      integer(kind=c_int)             :: simd_set_index

      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC)                = GENERIC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE)         = GENERIC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP)                    = BLUEGENE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ)                    = BLUEGENE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY)           = SSE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2)             = SSE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4)             = SSE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6)             = SSE_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2)             = AVX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4)             = AVX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6)             = AVX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2)            = AVX2_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4)            = AVX2_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6)            = AVX2_INSTR
      ! the AVX-512 kernels need AVX-512, AVX2 alone is not sufficient
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2)          = AVX512_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4)          = AVX512_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6)          = AVX512_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU)                    = NVIDIA_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2)         = SPARC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4)         = SPARC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6)         = SPARC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2)     = ARCH64_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4)     = ARCH64_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6)     = ARCH64_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2)             = VSX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4)             = VSX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6)             = VSX_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4)  = GENERIC_INSTR
      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6)  = GENERIC_INSTR

      simd_set_index = realKernels_to_simdTable(kernel)
    end function map_real_kernel_to_simd_instruction

    !> \brief Return the default real 2stage kernel for a SIMD instruction set.
    !> \param  simd_set_index  one of the *_INSTR constants
    !> \result kernel          the ELPA_2STAGE_REAL_* kernel registered for it
    !>
    !> No kernel is registered for the indices CPU_MANUFACTURER and
    !> NUMBER_OF_INSTR; the result for those is undefined.
    function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel)
      implicit none
      integer(kind=c_int)             :: kernel
      integer(kind=c_int), intent(in) :: simd_set_index

      simdTable_to_realKernels(GENERIC_INSTR)  = ELPA_2STAGE_REAL_GENERIC
      simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
      simdTable_to_realKernels(SSE_INSTR)      = ELPA_2STAGE_REAL_SSE_BLOCK2
      simdTable_to_realKernels(AVX_INSTR)      = ELPA_2STAGE_REAL_AVX_BLOCK2
      simdTable_to_realKernels(AVX2_INSTR)     = ELPA_2STAGE_REAL_AVX2_BLOCK2
      simdTable_to_realKernels(AVX512_INSTR)   = ELPA_2STAGE_REAL_AVX512_BLOCK2
      simdTable_to_realKernels(NVIDIA_INSTR)   = ELPA_2STAGE_REAL_GPU
      simdTable_to_realKernels(SPARC_INSTR)    = ELPA_2STAGE_REAL_SPARC64_BLOCK2
      simdTable_to_realKernels(ARCH64_INSTR)   = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
      simdTable_to_realKernels(VSX_INSTR)      = ELPA_2STAGE_REAL_VSX_BLOCK2

      kernel = simdTable_to_realKernels(simd_set_index)
    end function map_simd_instruction_to_real_kernel

    !> \brief Return the SIMD instruction set a complex 2stage kernel requires.
    !> \param  kernel          one of the ELPA_2STAGE_COMPLEX_* kernel constants
    !> \result simd_set_index  the matching *_INSTR constant
    function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index)
      implicit none
      integer(kind=c_int), intent(in) :: kernel
      integer(kind=c_int)             :: simd_set_index

      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC)        = GENERIC_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP)            = BLUEGENE_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ)            = BLUEGENE_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY)   = SSE_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1)     = SSE_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2)     = SSE_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1)     = AVX_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2)     = AVX_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)    = AVX2_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2)    = AVX2_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)  = AVX512_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)  = AVX512_INSTR
      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU)            = NVIDIA_INSTR

      simd_set_index = complexKernels_to_simdTable(kernel)
    end function map_complex_kernel_to_simd_instruction

    !> \brief Return the default complex 2stage kernel for a SIMD instruction set.
    !> \param  simd_set_index  one of the *_INSTR constants
    !> \result kernel          the ELPA_2STAGE_COMPLEX_* kernel registered for it
    !>
    !> NOTE(review): no complex kernel is registered for VSX_INSTR, ARCH64_INSTR
    !> and SPARC_INSTR (nor for CPU_MANUFACTURER and NUMBER_OF_INSTR); the result
    !> for those indices is undefined.
    function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel)
      implicit none
      integer(kind=c_int)             :: kernel
      integer(kind=c_int), intent(in) :: simd_set_index

      simdTable_to_complexKernels(GENERIC_INSTR)  = ELPA_2STAGE_COMPLEX_GENERIC
      simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
      simdTable_to_complexKernels(SSE_INSTR)      = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
      simdTable_to_complexKernels(AVX_INSTR)      = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
      simdTable_to_complexKernels(AVX2_INSTR)     = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
      simdTable_to_complexKernels(AVX512_INSTR)   = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
      simdTable_to_complexKernels(NVIDIA_INSTR)   = ELPA_2STAGE_COMPLEX_GPU

      kernel = simdTable_to_complexKernels(simd_set_index)
    end function map_simd_instruction_to_complex_kernel

end module simd_kernel