diff --git a/Makefile.am b/Makefile.am
index e35903d32921a31ec7c1944ab0a958c1eb973a9f..01d55200c4fba8981f2fdc26d085d8cc8323b7a2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -177,6 +177,11 @@ if !HAVE_DETAILED_TIMINGS
endif
endif
+
+if HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+ libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90
+endif
+
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90
endif
@@ -480,6 +485,7 @@ nobase_elpa_include_HEADERS = \
$(wildcard modules/*) \
src/helpers/lapack_interfaces.h \
src/helpers/scalapack_interfaces.h \
+ elpa/elpa_simd_constants.h \
elpa/elpa.h \
elpa/elpa_generic.h \
elpa/elpa_legacy.h
diff --git a/configure.ac b/configure.ac
index 7767516926647f054288f67f831af23efe8d95e7..98a0703525eabe9d36896d8b44e719dc09928a86 100644
--- a/configure.ac
+++ b/configure.ac
@@ -189,23 +189,24 @@ fi
AX_EXT
-dnl mixed-cluster-support
-AC_MSG_CHECKING(whether mixed-cluster-support should be enabled)
-AC_ARG_ENABLE([mixed-cluster-support],
- AS_HELP_STRING([--mixed-cluster-support],
- [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary has a performance penalty!]),
+dnl heterogenous-cluster-support
+dnl Optional, experimental feature: ELPA checks at run time which SIMD
+dnl instruction sets are available on all CPUs and, if the chosen 2stage
+dnl kernel cannot run everywhere, switches to one that can.
+dnl Enabling it defines the preprocessor macro and the automake conditional
+dnl HAVE_HETEROGENOUS_CLUSTER_SUPPORT. Default: disabled.
+AC_MSG_CHECKING(whether heterogenous-cluster-support should be enabled)
+AC_ARG_ENABLE([heterogenous-cluster-support],
+ AS_HELP_STRING([--enable-heterogenous-cluster-support],
+ [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary, since it has a performance penalty! This feature is experimental!]),
[
 if test x"$enableval" = x"yes"; then
- enable_mixed_cluster_support=yes
+ enable_heterogenous_cluster_support=yes
 else
- enable_mixed_cluster_support=no
+ enable_heterogenous_cluster_support=no
 fi
 ],
- [enable_mixed_cluster_support="no"])
-AC_MSG_RESULT([$enable_mixed_cluster_support])
-if test x"${enable_mixed_cluster_support}" = x"yes"; then
- AC_DEFINE([HAVE_MIXED_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs])
+ [enable_heterogenous_cluster_support="no"])
+AC_MSG_RESULT([$enable_heterogenous_cluster_support])
+if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
+ AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs])
fi
+dnl the conditional decides in Makefile.am whether the CPU detection helpers are built
+AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
diff --git a/elpa/elpa_simd_constants.h b/elpa/elpa_simd_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..6049ee71cf9d062e0b9affb1bc364332c2a7c184
--- /dev/null
+++ b/elpa/elpa_simd_constants.h
@@ -0,0 +1,13 @@
+#ifndef ELPA_SIMD_CONSTANTS_H
+#define ELPA_SIMD_CONSTANTS_H
+
+/*
+ * 1-based indices into the table filled by get_cpuid_set() and used by the
+ * Fortran module simd_kernel. This header is included from C and from
+ * preprocessed Fortran sources, so it contains preprocessor directives only.
+ *
+ * CPU_MANUFACTURER : entry is 1 if the CPU vendor string is GenuineIntel
+ * *_INSTR          : entry is 1 if the instruction set can be used
+ */
+#define CPU_MANUFACTURER 1
+#define GENERIC_INSTR 2
+#define BLUEGENE_INSTR 3
+#define SSE_INSTR 4
+#define AVX_INSTR 5
+#define AVX2_INSTR 6
+#define AVX512_INSTR 7
+#define NVIDIA_INSTR 8
+#define VSX_INSTR 9
+#define ARCH64_INSTR 10
+#define SPARC_INSTR 11
+
+/* length of the table; one larger than the highest index defined above */
+#define NUMBER_OF_INSTR 12
+
+#endif /* ELPA_SIMD_CONSTANTS_H */
diff --git a/src/elpa2/elpa2.F90 b/src/elpa2/elpa2.F90
index 9b428da76ebf6614064966b08b5a7dff55ca19cc..f31f5063a018535fb872c9444ec041892dc0dfd7 100644
--- a/src/elpa2/elpa2.F90
+++ b/src/elpa2/elpa2.F90
@@ -80,7 +80,7 @@ module elpa2_impl
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
!-------------------------------------------------------------------------------
-!> \brief elpasolve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
+!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
diff --git a/src/elpa2/elpa2_template.F90 b/src/elpa2/elpa2_template.F90
index ec96bd6e50ad26160acc2597cba112c605fdd2c1..b59812142805c10de46558c0af0888c3cf304b34 100644
--- a/src/elpa2/elpa2_template.F90
+++ b/src/elpa2/elpa2_template.F90
@@ -49,6 +49,9 @@
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
+
+#include "elpa/elpa_simd_constants.h"
+
function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
@@ -64,7 +67,9 @@
use cuda_functions
use mod_check_for_gpu
use elpa_omp
-
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+ use simd_kernel
+#endif
use iso_c_binding
implicit none
#include "../general/precision_kinds.F90"
@@ -74,14 +79,14 @@
logical :: useQR
logical :: useQRActual
#endif
- integer(kind=c_int) :: kernel
+ integer(kind=c_int) :: kernel, kernelByUser
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
- MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
+ MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
- MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
+ MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
@@ -124,6 +129,12 @@
do_trans_to_band, do_trans_to_full
integer(kind=ik) :: nrThreads
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+ integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR)
+#endif
+
+
+
#if REALCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
@@ -377,6 +388,88 @@
#endif
+ ! consistency check: is user set kernel still identical with "kernel" or did
+ ! we change it above? This is a mess and should be cleaned up
+ call obj%get(KERNEL_STRING,kernelByUser,error)
+ if (error .ne. ELPA_OK) then
+ print *,"Problem getting option. Aborting..."
+ stop
+ endif
+
+ if (kernelByUser .ne. kernel) then
+ call obj%set(KERNEL_STRING, kernel, error)
+ if (error .ne. ELPA_OK) then
+ print *,"Problem setting option. Aborting..."
+ stop
+ endif
+ endif
+
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+    ! Heterogenous cluster support (experimental).
+    ! Find a kernel which is supported on all used CPUs:
+    !   1. every process asks its CPU which instruction sets it offers,
+    !   2. the per-process tables are combined with a bitwise AND, so an entry
+    !      stays 1 only if it is 1 on every process,
+    !   3. if the currently set kernel needs an instruction set that is not
+    !      available everywhere, another kernel is selected.
+    ! At the moment this works only on Intel CPUs.
+    !
+    ! NOTE(review): get_cpuid_set only ever marks the GENERIC, SSE, AVX, AVX2 and
+    ! AVX512 entries. A kernel mapped to any other entry (for example the GPU
+    ! kernel, mapped to NVIDIA_INSTR) is therefore always treated as unavailable
+    ! here. Confirm that this is intended.
+    simdSetAvailable(:) = 0
+    call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR)
+#ifdef WITH_MPI
+    call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, mpi_comm_all, mpierr)
+#endif
+
+    ! compare user chosen kernel with possible kernels
+    call obj%get(KERNEL_STRING,kernelByUser,error)
+    if (error .ne. ELPA_OK) then
+      print *,"Problem getting option. Aborting..."
+      stop
+    endif
+
+    ! map kernel to SIMD set, and check whether this set is available on all cores
+
+#if REALCASE == 1
+    if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then
+#endif
+#if COMPLEXCASE == 1
+    if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then
+#endif
+
+      ! if we are not purely running on Intel CPUs, this feature does not work at the moment
+      ! this restriction should be lifted step by step
+      if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then
+        if (my_pe == 0 ) then
+          write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support."
+          write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!"
+          write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now"
+        endif
+        ! the reduced table is identical on all processes, so all of them take
+        ! this branch; every process has to terminate, not only the printing one
+        stop
+      else
+        if (my_pe == 0 ) then
+          write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs"
+          write(error_unit,*) "ELPA will use another kernel..."
+        endif
+
+        ! find best kernel available for supported instruction sets,
+        ! starting with the highest table index; index 1 (CPU_MANUFACTURER)
+        ! is not an instruction set and is skipped
+        do i = NUMBER_OF_INSTR, 2, -1
+          if (simdSetAvailable(i) == 1) then
+            ! map to "best" kernel with this instruction set
+            ! this can be only done for kernels that ELPA has been configured to use
+#if REALCASE == 1
+            kernel = map_simd_instruction_to_real_kernel(i)
+#endif
+#if COMPLEXCASE == 1
+            kernel = map_simd_instruction_to_complex_kernel(i)
+#endif
+            if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then
+              call obj%set(KERNEL_STRING, kernel, error)
+              if (error .ne. ELPA_OK) then
+                print *,"Problem setting option. Aborting..."
+                stop
+              endif
+              if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel)
+              exit
+            endif
+          endif
+        enddo
+
+        ! if the loop ran to completion (no exit), the loop variable ends below
+        ! the lower bound: no usable kernel could be set, so do not continue
+        ! with a kernel value that was rejected by can_set
+        if (i < 2) then
+          print *,"No kernel which is supported on all CPUs could be set. Aborting..."
+          stop
+        endif
+      endif
+
+    endif
+#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */
#if REALCASE == 1
call obj%get("qr",qr,error)
diff --git a/src/elpa_impl.F90 b/src/elpa_impl.F90
index dcc0592e77b6c8abc3df2cfcda5684a336b4a594..4643a34435fbf3a6348b7d5e63c983b3b2950dc1 100644
--- a/src/elpa_impl.F90
+++ b/src/elpa_impl.F90
@@ -1799,7 +1799,6 @@ module elpa_impl
#endif
end select
- !print *, "testing, before C call, ts_impl%current is ", ts_impl%current
if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, &
ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then
@@ -1812,7 +1811,6 @@ module elpa_impl
error = ELPA_ERROR_CANNOT_OPEN_FILE
#endif
endif
- !print *, "testing, after C call, ts_impl%current is ", ts_impl%current
end subroutine
diff --git a/src/helpers/get_cpuid_set.c b/src/helpers/get_cpuid_set.c
new file mode 100644
index 0000000000000000000000000000000000000000..5e204f2ded0ab837bd9a3f48cdbb54fc4fbae3dc
--- /dev/null
+++ b/src/helpers/get_cpuid_set.c
@@ -0,0 +1,221 @@
+// This file is part of ELPA.
+//
+// The ELPA library was originally created by the ELPA consortium,
+// consisting of the following organizations:
+//
+// - Max Planck Computing and Data Facility (MPCDF), formerly known as
+// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+// Informatik,
+// - Technische Universität München, Lehrstuhl für Informatik mit
+// Schwerpunkt Wissenschaftliches Rechnen ,
+// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+// and
+// - IBM Deutschland GmbH
+//
+//
+// This particular source code file contains additions, changes and
+// enhancements authored by Intel Corporation which is not part of
+// the ELPA consortium.
+//
+// More information can be found here:
+// http://elpa.mpcdf.mpg.de/
+//
+// ELPA is free software: you can redistribute it and/or modify
+// it under the terms of the version 3 of the license of the
+// GNU Lesser General Public License as published by the Free
+// Software Foundation.
+//
+// ELPA is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with ELPA. If not, see <http://www.gnu.org/licenses/>
+//
+// ELPA reflects a substantial effort on the part of the original
+// ELPA consortium, and we ask you to respect the spirit of the
+// license that we chose: i.e., please contribute any changes you
+// may have back to the original ELPA library distribution, and keep
+// any derivatives of ELPA under the same license that we chose for
+// the original distribution, the GNU Lesser General Public License.
+//
+// Author: Andreas Marek, MPCDF
+
+#include "config.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "elpa/elpa_simd_constants.h"
+
+/*
+ * Store in set[CPU_MANUFACTURER - 1] whether the executing CPU reports the
+ * vendor string "GenuineIntel" (1) or any other vendor (0).
+ *
+ * set: array provided by the caller; it must hold at least
+ *      CPU_MANUFACTURER entries, no bounds check is done here.
+ */
+static inline void get_cpu_manufacturer(int *set)
+{
+  /* registers[0] = eax, [1] = ebx, [2] = edx, [3] = ecx.
+   * edx is stored before ecx on purpose: CPUID leaf 0 returns the vendor
+   * string in the order ebx, edx, ecx, so registers[1..3] then hold its
+   * 12 bytes contiguously. */
+  uint32_t registers[4] = {0, 0, 0, 0};
+
+  /* inputs: eax = 0 selects leaf 0, ecx = 0 */
+  asm volatile("cpuid"
+               : "=a" (registers[0]), "=b" (registers[1]),
+                 "=c" (registers[3]), "=d" (registers[2])
+               : "0" (registers[0]), "2" (registers[3])
+               : "memory");
+
+  static const char intel_vendor[] = "GenuineIntel";
+  char manufacturer[13];
+
+  /* copy from the address of the ebx slot, not from its value */
+  memcpy(manufacturer, &registers[1], 12);
+  manufacturer[12] = '\0';
+
+  if (strcmp(manufacturer, intel_vendor) == 0) {
+    set[CPU_MANUFACTURER - 1] = 1;
+  } else {
+    set[CPU_MANUFACTURER - 1] = 0;
+  }
+}
+
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+#include <cpuid.h>
+/*
+ * Execute the CPUID instruction for leaf InfoType with sub-leaf 0.
+ *
+ * info:     receives eax, ebx, ecx, edx in info[0], info[1], info[2], info[3]
+ * InfoType: CPUID leaf to query
+ */
+void cpuid(int info[4], int InfoType){
+  __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
+}
+#endif
+
+/*
+!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+!f> interface
+!f> subroutine get_cpuid_set(simdSet, n) &
+!f> bind(C, name="get_cpuid_set")
+!f> use, intrinsic :: iso_c_binding
+!f> integer(kind=c_int), value :: n
+!f> integer(kind=c_int) :: simdSet(n)
+!f> end subroutine
+!f> end interface
+!f>#endif
+*/
+/*
+ * Store `available` as 0 or 1 in the table entry of instruction set `instr`
+ * (1-based index from elpa_simd_constants.h), if the table has room for it.
+ */
+static void store_simd_flag(int *set, int nlength, int instr, bool available)
+{
+  if (instr >= 1 && instr <= nlength) {
+    set[instr - 1] = available ? 1 : 0;
+  }
+}
+
+/*
+ * Fill the table `set` with the capabilities of the executing CPU.
+ *
+ * set:     array of nlength ints, indexed with (constant - 1) for the
+ *          constants of elpa_simd_constants.h
+ * nlength: number of entries of `set`; entries that do not fit are skipped
+ *
+ * Written entries: CPU_MANUFACTURER, GENERIC_INSTR (always 1), SSE_INSTR,
+ * AVX_INSTR, AVX2_INSTR and AVX512_INSTR. All other entries are left as
+ * passed in by the caller.
+ *
+ * The feature bits follow
+ * http://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set/7495023#7495023
+ * Only the bits that decide one of the entries above are evaluated.
+ *
+ * NOTE(review): only the CPUID feature bits are checked; whether the
+ * operating system enabled the AVX/AVX-512 register state is not verified.
+ */
+void get_cpuid_set(int *set, int nlength){
+
+  /* default to "not available", so that a CPUID leaf which the CPU does not
+   * provide can never leave a flag undefined */
+  bool HW_SSE42   = false;
+  bool HW_AVX     = false;
+  bool HW_AVX2    = false;
+  bool HW_AVX512F = false;  // AVX512 Foundation
+
+  if (set == NULL || nlength < 1) {
+    return;
+  }
+
+  if (nlength >= CPU_MANUFACTURER) {
+    get_cpu_manufacturer(set);
+  }
+
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+  int info[4];
+
+  /* leaf 0: eax holds the highest supported standard leaf */
+  cpuid(info, 0);
+  int nIds = info[0];
+
+  // Detect Features
+  if (nIds >= 0x00000001){
+    cpuid(info,0x00000001);
+    /* test the bits on an unsigned copy: shifting a signed 1 is not safe
+     * for the highest bit */
+    unsigned int ecx = (unsigned int) info[2];
+    HW_SSE42 = (ecx & (1u << 20)) != 0;
+    HW_AVX   = (ecx & (1u << 28)) != 0;
+  }
+  if (nIds >= 0x00000007){
+    cpuid(info,0x00000007);
+    unsigned int ebx = (unsigned int) info[1];
+    HW_AVX2    = (ebx & (1u <<  5)) != 0;
+    HW_AVX512F = (ebx & (1u << 16)) != 0;
+  }
+#endif
+
+  // always allow GENERIC
+  store_simd_flag(set, nlength, GENERIC_INSTR, true);
+
+  // the rest depends on the CPU
+  store_simd_flag(set, nlength, SSE_INSTR,    HW_SSE42);
+  store_simd_flag(set, nlength, AVX_INSTR,    HW_AVX);
+  store_simd_flag(set, nlength, AVX2_INSTR,   HW_AVX2);
+  store_simd_flag(set, nlength, AVX512_INSTR, HW_AVX512F);
+
+}
+
+
diff --git a/src/helpers/mod_simd_kernel.F90 b/src/helpers/mod_simd_kernel.F90
new file mode 100644
index 0000000000000000000000000000000000000000..5d956b8cf623f1631c66b43da35c34b6f35457c3
--- /dev/null
+++ b/src/helpers/mod_simd_kernel.F90
@@ -0,0 +1,171 @@
+! This file is part of ELPA.
+!
+! The ELPA library was originally created by the ELPA consortium,
+! consisting of the following organizations:
+!
+! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+! Informatik,
+! - Technische Universität München, Lehrstuhl für Informatik mit
+! Schwerpunkt Wissenschaftliches Rechnen ,
+! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+! and
+! - IBM Deutschland GmbH
+!
+!
+! More information can be found here:
+! http://elpa.rzg.mpg.de/
+!
+! ELPA is free software: you can redistribute it and/or modify
+! it under the terms of the version 3 of the license of the
+! GNU Lesser General Public License as published by the Free
+! Software Foundation.
+!
+! ELPA is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public License
+! along with ELPA. If not, see <http://www.gnu.org/licenses/>
+!
+! ELPA reflects a substantial effort on the part of the original
+! ELPA consortium, and we ask you to respect the spirit of the
+! license that we chose: i.e., please contribute any changes you
+! may have back to the original ELPA library distribution, and keep
+! any derivatives of ELPA under the same license that we chose for
+! the original distribution, the GNU Lesser General Public License.
+!
+! This file was written by A. Marek, MPCDF
+
+#include "config-f90.h"
+#include "elpa/elpa_simd_constants.h"
+
+!> Mapping between the ELPA 2stage kernels and the instruction set table
+!> whose indices are defined in elpa/elpa_simd_constants.h.
+!>
+!> The lookup tables are module variables which are (re-)filled on every call
+!> of the respective mapping function.
+module simd_kernel
+  use elpa_constants
+  use iso_c_binding
+
+  implicit none
+
+  integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS)
+  integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR)
+  integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS)
+  integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR)
+
+  contains
+
+    !> Return the instruction set table index needed by a real 2stage kernel.
+    !> \param kernel          one of the ELPA_2STAGE_REAL_* constants; used
+    !>                        directly as array index, no range check is done
+    !> \result simd_set_index one of the *_INSTR constants
+    function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index)
+
+      use iso_c_binding
+      implicit none
+
+      integer(kind=c_int), intent(in) :: kernel
+      integer(kind=c_int)             :: simd_set_index
+
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC)               = GENERIC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE)        = GENERIC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP)                   = BLUEGENE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ)                   = BLUEGENE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY)          = SSE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2)            = SSE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4)            = SSE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6)            = SSE_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2)            = AVX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4)            = AVX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6)            = AVX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2)           = AVX2_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4)           = AVX2_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6)           = AVX2_INSTR
+      ! the AVX512 kernels need AVX512, exactly as the complex AVX512 kernels;
+      ! with AVX2_INSTR they would pass the availability check on AVX2-only CPUs
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2)         = AVX512_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4)         = AVX512_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6)         = AVX512_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU)                   = NVIDIA_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2)        = SPARC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4)        = SPARC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6)        = SPARC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2)    = ARCH64_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4)    = ARCH64_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6)    = ARCH64_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2)            = VSX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4)            = VSX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6)            = VSX_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) = GENERIC_INSTR
+      realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) = GENERIC_INSTR
+
+      simd_set_index = realKernels_to_simdTable(kernel)
+
+    end function map_real_kernel_to_simd_instruction
+
+    !> Return the real 2stage kernel which is chosen for an instruction set.
+    !> \param simd_set_index one of the *_INSTR constants; the table entries of
+    !>                       all other indices are not assigned in this function
+    !> \result kernel        one of the ELPA_2STAGE_REAL_* constants
+    function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel)
+
+      use iso_c_binding
+      implicit none
+
+      integer(kind=c_int)             :: kernel
+      integer(kind=c_int), intent(in) :: simd_set_index
+
+      simdTable_to_realKernels(GENERIC_INSTR)  = ELPA_2STAGE_REAL_GENERIC
+      simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
+      simdTable_to_realKernels(SSE_INSTR)      = ELPA_2STAGE_REAL_SSE_BLOCK2
+      simdTable_to_realKernels(AVX_INSTR)      = ELPA_2STAGE_REAL_AVX_BLOCK2
+      simdTable_to_realKernels(AVX2_INSTR)     = ELPA_2STAGE_REAL_AVX2_BLOCK2
+      simdTable_to_realKernels(AVX512_INSTR)   = ELPA_2STAGE_REAL_AVX512_BLOCK2
+      simdTable_to_realKernels(NVIDIA_INSTR)   = ELPA_2STAGE_REAL_GPU
+      simdTable_to_realKernels(SPARC_INSTR)    = ELPA_2STAGE_REAL_SPARC64_BLOCK2
+      simdTable_to_realKernels(ARCH64_INSTR)   = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
+      simdTable_to_realKernels(VSX_INSTR)      = ELPA_2STAGE_REAL_VSX_BLOCK2
+
+      kernel = simdTable_to_realKernels(simd_set_index)
+
+    end function map_simd_instruction_to_real_kernel
+
+    !> Return the instruction set table index needed by a complex 2stage kernel.
+    !> \param kernel          one of the ELPA_2STAGE_COMPLEX_* constants; used
+    !>                        directly as array index, no range check is done
+    !> \result simd_set_index one of the *_INSTR constants
+    function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index)
+
+      use iso_c_binding
+      implicit none
+
+      integer(kind=c_int), intent(in) :: kernel
+      integer(kind=c_int)             :: simd_set_index
+
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC)        = GENERIC_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP)            = BLUEGENE_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ)            = BLUEGENE_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY)   = SSE_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1)     = SSE_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2)     = SSE_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1)     = AVX_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2)     = AVX_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)    = AVX2_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2)    = AVX2_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)  = AVX512_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)  = AVX512_INSTR
+      complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU)            = NVIDIA_INSTR
+
+      simd_set_index = complexKernels_to_simdTable(kernel)
+
+    end function map_complex_kernel_to_simd_instruction
+
+    !> Return the complex 2stage kernel which is chosen for an instruction set.
+    !> \param simd_set_index one of the *_INSTR constants assigned below; the
+    !>                       table entries of all other indices (among them
+    !>                       SPARC_INSTR, ARCH64_INSTR and VSX_INSTR) are not
+    !>                       assigned in this function
+    !> \result kernel        one of the ELPA_2STAGE_COMPLEX_* constants
+    function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel)
+
+      use iso_c_binding
+      implicit none
+
+      integer(kind=c_int)             :: kernel
+      integer(kind=c_int), intent(in) :: simd_set_index
+
+      simdTable_to_complexKernels(GENERIC_INSTR)  = ELPA_2STAGE_COMPLEX_GENERIC
+      simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
+      simdTable_to_complexKernels(SSE_INSTR)      = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
+      simdTable_to_complexKernels(AVX_INSTR)      = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
+      simdTable_to_complexKernels(AVX2_INSTR)     = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
+      simdTable_to_complexKernels(AVX512_INSTR)   = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
+      simdTable_to_complexKernels(NVIDIA_INSTR)   = ELPA_2STAGE_COMPLEX_GPU
+
+      kernel = simdTable_to_complexKernels(simd_set_index)
+
+    end function map_simd_instruction_to_complex_kernel
+
+end module
+
diff --git a/src/helpers/print_build_config.c b/src/helpers/print_build_config.c
index a2285a75556378f5e15ed4255979fb52c2d29961..c7f9a62d9a61e63bc6d82999d24ee05ea538984e 100644
--- a/src/helpers/print_build_config.c
+++ b/src/helpers/print_build_config.c
@@ -1,3 +1,51 @@
+// This file is part of ELPA.
+//
+// The ELPA library was originally created by the ELPA consortium,
+// consisting of the following organizations:
+//
+// - Max Planck Computing and Data Facility (MPCDF), formerly known as
+// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+// Informatik,
+// - Technische Universität München, Lehrstuhl für Informatik mit
+// Schwerpunkt Wissenschaftliches Rechnen ,
+// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+// and
+// - IBM Deutschland GmbH
+//
+//
+// This particular source code file contains additions, changes and
+// enhancements authored by Intel Corporation which is not part of
+// the ELPA consortium.
+//
+// More information can be found here:
+// http://elpa.mpcdf.mpg.de/
+//
+// ELPA is free software: you can redistribute it and/or modify
+// it under the terms of the version 3 of the license of the
+// GNU Lesser General Public License as published by the Free
+// Software Foundation.
+//
+// ELPA is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with ELPA. If not, see <http://www.gnu.org/licenses/>
+//
+// ELPA reflects a substantial effort on the part of the original
+// ELPA consortium, and we ask you to respect the spirit of the
+// license that we chose: i.e., please contribute any changes you
+// may have back to the original ELPA library distribution, and keep
+// any derivatives of ELPA under the same license that we chose for
+// the original distribution, the GNU Lesser General Public License.
+//
+// Author: Andreas Marek, MPCDF
+
+
#include "config.h"
#include "elpa/elpa_build_config.h"
#include