diff --git a/Makefile.am b/Makefile.am index e35903d32921a31ec7c1944ab0a958c1eb973a9f..01d55200c4fba8981f2fdc26d085d8cc8323b7a2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -177,6 +177,11 @@ if !HAVE_DETAILED_TIMINGS endif endif + +if HAVE_HETEROGENOUS_CLUSTER_SUPPORT + libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90 +endif + if WITH_REAL_GENERIC_KERNEL libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90 endif @@ -480,6 +485,7 @@ nobase_elpa_include_HEADERS = \ $(wildcard modules/*) \ src/helpers/lapack_interfaces.h \ src/helpers/scalapack_interfaces.h \ + elpa/elpa_simd_constants.h \ elpa/elpa.h \ elpa/elpa_generic.h \ elpa/elpa_legacy.h diff --git a/configure.ac b/configure.ac index 7767516926647f054288f67f831af23efe8d95e7..98a0703525eabe9d36896d8b44e719dc09928a86 100644 --- a/configure.ac +++ b/configure.ac @@ -189,23 +189,24 @@ fi AX_EXT -dnl mixed-cluster-support -AC_MSG_CHECKING(whether mixed-cluster-support should be enabled) -AC_ARG_ENABLE([mixed-cluster-support], - AS_HELP_STRING([--mixed-cluster-support], - [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary has a performance penalty!]), +dnl heterogenous-cluster-support +AC_MSG_CHECKING(whether heterogenous-cluster-support should be enabled) +AC_ARG_ENABLE([heterogenous-cluster-support], + AS_HELP_STRING([--enable-heterogenous-cluster-support], + [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary, as it has a performance penalty! 
This feature is experimental!]), [ if test x"$enableval" = x"yes"; then - enable_mixed_cluster_support=yes + enable_heterogenous_cluster_support=yes else - enable_mixed_cluster_support=no + enable_heterogenous_cluster_support=no fi ], - [enable_mixed_cluster_support="no"]) -AC_MSG_RESULT([$enable_mixed_cluster_support]) -if test x"${enable_mixed_cluster_support}" = x"yes"; then - AC_DEFINE([HAVE_MIXED_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs]) + [enable_heterogenous_cluster_support="no"]) +AC_MSG_RESULT([$enable_heterogenous_cluster_support]) +if test x"${enable_heterogenous_cluster_support}" = x"yes"; then + AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs]) fi +AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"]) AC_MSG_CHECKING(whether C compiler can use _Generic ) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ diff --git a/elpa/elpa_simd_constants.h b/elpa/elpa_simd_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..6049ee71cf9d062e0b9affb1bc364332c2a7c184 --- /dev/null +++ b/elpa/elpa_simd_constants.h @@ -0,0 +1,13 @@ +#define CPU_MANUFACTURER 1 +#define GENERIC_INSTR 2 +#define BLUEGENE_INSTR 3 +#define SSE_INSTR 4 +#define AVX_INSTR 5 +#define AVX2_INSTR 6 +#define AVX512_INSTR 7 +#define NVIDIA_INSTR 8 +#define VSX_INSTR 9 +#define ARCH64_INSTR 10 +#define SPARC_INSTR 11 + +#define NUMBER_OF_INSTR 12 diff --git a/src/elpa2/elpa2.F90 b/src/elpa2/elpa2.F90 index 9b428da76ebf6614064966b08b5a7dff55ca19cc..f31f5063a018535fb872c9444ec041892dc0dfd7 100644 --- a/src/elpa2/elpa2.F90 +++ b/src/elpa2/elpa2.F90 @@ -80,7 +80,7 @@ module elpa2_impl #define DOUBLE_PRECISION 1 #include "../general/precision_macros.h" !------------------------------------------------------------------------------- -!> \brief elpasolve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real 
eigenvalue problem with a 2 stage approach +!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach !> !> Parameters !> diff --git a/src/elpa2/elpa2_template.F90 b/src/elpa2/elpa2_template.F90 index ec96bd6e50ad26160acc2597cba112c605fdd2c1..b59812142805c10de46558c0af0888c3cf304b34 100644 --- a/src/elpa2/elpa2_template.F90 +++ b/src/elpa2/elpa2_template.F90 @@ -49,6 +49,9 @@ ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". + +#include "elpa/elpa_simd_constants.h" + function elpa_solve_evp_& &MATH_DATATYPE& &_& @@ -64,7 +67,9 @@ use cuda_functions use mod_check_for_gpu use elpa_omp - +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + use simd_kernel +#endif use iso_c_binding implicit none #include "../general/precision_kinds.F90" @@ -74,14 +79,14 @@ logical :: useQR logical :: useQRActual #endif - integer(kind=c_int) :: kernel + integer(kind=c_int) :: kernel, kernelByUser #ifdef USE_ASSUMED_SIZE MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*) - MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*) + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*) #else MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols) - MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols) + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols) #endif real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na) MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:) @@ -124,6 +129,12 @@ do_trans_to_band, do_trans_to_full integer(kind=ik) :: nrThreads +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + integer(kind=c_int) :: 
simdSetAvailable(NUMBER_OF_INSTR) +#endif + + + #if REALCASE == 1 #undef GPU_KERNEL #undef GENERIC_KERNEL @@ -377,6 +388,88 @@ #endif + ! consistency check: is user set kernel still identical with "kernel" or did + ! we change it above? This is a mess and should be cleaned up + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + if (kernelByUser .ne. kernel) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + endif + +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + ! find a kernel which is supported on all used CPUs + ! at the moment this works only on Intel CPUs + simdSetAvailable(:) = 0 + call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR) +#ifdef WITH_MPI + call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, mpi_comm_all, mpierr) +#endif + + ! compare user chosen kernel with possible kernels + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + ! map kernel to SIMD Set, and check whether this is set is available on all cores + +#if REALCASE == 1 + if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then +#endif +#if COMPLEXCASE == 1 + if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then +#endif + + ! if we are not purely running on Intel CPUs, this feature does not work at the moment + ! this restriction should be lifted step by step + if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then + if (my_pe == 0 ) then + write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support." + write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!" 
+ write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now" + stop + endif + else + if (my_pe == 0 ) then + write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs" + write(error_unit,*) "ELPA will use another kernel..." + endif + + ! find best kernel available for supported instruction sets + do i = NUMBER_OF_INSTR, 2, -1 + if (simdSetAvailable(i) == 1) then + ! map to "best" kernel with this instruction set + ! this can be only done for kernels that ELPA has been configured to use +#if REALCASE == 1 + kernel = map_simd_instruction_to_real_kernel(i) +#endif +#if COMPLEXCASE == 1 + kernel = map_simd_instruction_to_complex_kernel(i) +#endif + if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel) + exit + endif + endif + enddo + endif + + endif +#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */ #if REALCASE == 1 call obj%get("qr",qr,error) diff --git a/src/elpa_impl.F90 b/src/elpa_impl.F90 index dcc0592e77b6c8abc3df2cfcda5684a336b4a594..4643a34435fbf3a6348b7d5e63c983b3b2950dc1 100644 --- a/src/elpa_impl.F90 +++ b/src/elpa_impl.F90 @@ -1799,7 +1799,6 @@ module elpa_impl #endif end select - !print *, "testing, before C call, ts_impl%current is ", ts_impl%current if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, & ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then @@ -1812,7 +1811,6 @@ module elpa_impl error = ELPA_ERROR_CANNOT_OPEN_FILE #endif endif - !print *, "testing, after C call, ts_impl%current is ", ts_impl%current end subroutine diff --git a/src/helpers/get_cpuid_set.c b/src/helpers/get_cpuid_set.c new file mode 100644 index 
0000000000000000000000000000000000000000..5e204f2ded0ab837bd9a3f48cdbb54fc4fbae3dc --- /dev/null +++ b/src/helpers/get_cpuid_set.c @@ -0,0 +1,221 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config.h" + +#include +#include +#include +#include + +#include "elpa/elpa_simd_constants.h" +// Runs CPUID leaf 0 and stores 1 in set[CPU_MANUFACTURER - 1] if the vendor string is "GenuineIntel", otherwise 0. +static inline void get_cpu_manufacturer(int *set) +{ + u_int32_t registers[4]; + registers[0] = 0; + asm volatile("cpuid": "=a" (registers[0]),"=b" (registers[1]),"=c" (registers[3]),"=d" (registers[2]): "0" (registers[0]), "2" (0): "memory"); + // EBX, EDX, ECX are now in registers[1], [2], [3], so the 12 vendor bytes are contiguous starting at registers[1] + char str[13]="GenuineIntel\0"; + char manufacturer[13]; + // copy from the address of registers[1]; the register value itself is not a pointer + memcpy(manufacturer, &registers[1], 12); + manufacturer[12] = '\0'; + + if (strcmp(manufacturer, str) == 0) { + set[CPU_MANUFACTURER - 1] = 1; + } else { + set[CPU_MANUFACTURER - 1] = 0; + } +} +// cpuid(): executes CPUID for leaf InfoType, sub-leaf 0; info[0..3] receive EAX, EBX, ECX, EDX. +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT +#include +void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} +#endif + +/* +!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT +!f> interface +!f> subroutine get_cpuid_set(simdSet, n) & +!f> bind(C, name="get_cpuid_set") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int), value :: n +!f> integer(kind=c_int) :: simdSet(n) +!f> end subroutine +!f> end interface +!f>#endif +*/ +void get_cpuid_set(int *set, int nlength){ + // Marks in set[] (0-based index = constant from elpa_simd_constants.h minus 1) the instruction sets this CPU reports; other entries are left untouched and nlength is currently not checked. + get_cpu_manufacturer(set); + + + // Code below taken from http://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set/7495023#7495023 + + // Misc. 
+ bool HW_MMX = false; + bool HW_x64 = false; + bool HW_ABM = false; // Advanced Bit Manipulation + bool HW_RDRAND = false; + bool HW_BMI1 = false; + bool HW_BMI2 = false; + bool HW_ADX = false; + bool HW_PREFETCHWT1 = false; + + // SIMD: 128-bit + bool HW_SSE = false; + bool HW_SSE2 = false; + bool HW_SSE3 = false; + bool HW_SSSE3 = false; + bool HW_SSE41 = false; + bool HW_SSE42 = false; + bool HW_SSE4a = false; + bool HW_AES = false; + bool HW_SHA = false; + + // SIMD: 256-bit + bool HW_AVX = false; + bool HW_XOP = false; + bool HW_FMA3 = false; + bool HW_FMA4 = false; + bool HW_AVX2 = false; + // SIMD: 512-bit + bool HW_AVX512F = false; // AVX512 Foundation + bool HW_AVX512CD = false; // AVX512 Conflict Detection + bool HW_AVX512PF = false; // AVX512 Prefetch + bool HW_AVX512ER = false; // AVX512 Exponential + Reciprocal + bool HW_AVX512VL = false; // AVX512 Vector Length Extensions + bool HW_AVX512BW = false; // AVX512 Byte + Word + bool HW_AVX512DQ = false; // AVX512 Doubleword + Quadword + bool HW_AVX512IFMA = false; // AVX512 Integer 52-bit Fused Multiply-Add + bool HW_AVX512VBMI = false; // AVX512 Vector Byte Manipulation Instructions + // every HW_* flag starts as false, so a CPUID leaf that is skipped below cannot leave a flag uninitialized + int info[4]; +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + + cpuid(info, 0); + int nIds = info[0]; + + cpuid(info, 0x80000000); + unsigned nExIds = info[0]; +#endif + // Detect Features + if (nIds >= 0x00000001){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x00000001); +#endif + HW_MMX = (info[3] & ((int)1 << 23)) != 0; + HW_SSE = (info[3] & ((int)1 << 25)) != 0; + HW_SSE2 = (info[3] & ((int)1 << 26)) != 0; + HW_SSE3 = (info[2] & ((int)1 << 0)) != 0; + + HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0; + HW_SSE41 = (info[2] & ((int)1 << 19)) != 0; + HW_SSE42 = (info[2] & ((int)1 << 20)) != 0; + HW_AES = (info[2] & ((int)1 << 25)) != 0; + + HW_AVX = (info[2] & ((int)1 << 28)) != 0; + HW_FMA3 = (info[2] & ((int)1 << 12)) != 0; + HW_RDRAND = (info[2] & ((int)1 << 30)) != 0; + } + if (nIds >= 0x00000007){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x00000007); +#endif + HW_AVX2 = (info[1] & ((int)1 << 5)) != 0; + + HW_BMI1 = (info[1] & ((int)1 << 3)) != 0; + HW_BMI2 = (info[1] & ((int)1 << 8)) != 0; + HW_ADX = (info[1] & ((int)1 << 19)) != 0; + HW_SHA = 
(info[1] & ((int)1 << 29)) != 0; + HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0; + + HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; + HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0; + HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0; + HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0; + HW_AVX512VL = ((unsigned)info[1] & (1u << 31)) != 0; // unsigned mask: shifting a signed 1 into bit 31 overflows + HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0; + HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0; + HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0; + HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0; + } + + if (nExIds >= 0x80000001){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x80000001); +#endif + HW_x64 = (info[3] & ((int)1 << 29)) != 0; + HW_ABM = (info[2] & ((int)1 << 5)) != 0; + HW_SSE4a = (info[2] & ((int)1 << 6)) != 0; + HW_FMA4 = (info[2] & ((int)1 << 16)) != 0; + HW_XOP = (info[2] & ((int)1 << 11)) != 0; + } + + // always allow GENERIC + set[GENERIC_INSTR -1] =1; + + // the rest depends on the CPU + if (HW_SSE42) { + set[SSE_INSTR - 1] = 1; + } + if (HW_AVX) { + set[AVX_INSTR - 1] = 1; + } + if (HW_AVX2) { + set[AVX2_INSTR - 1] = 1; + } + if (HW_AVX512F) { + set[AVX512_INSTR -1] = 1; + } + +} + + diff --git a/src/helpers/mod_simd_kernel.F90 b/src/helpers/mod_simd_kernel.F90 new file mode 100644 index 0000000000000000000000000000000000000000..5d956b8cf623f1631c66b43da35c34b6f35457c3 --- /dev/null +++ b/src/helpers/mod_simd_kernel.F90 @@ -0,0 +1,171 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. 
Komplexe Strukturen in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see <http://www.gnu.org/licenses/> +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF + +#include "config-f90.h" +#include "elpa/elpa_simd_constants.h" + +module simd_kernel + use elpa_constants + use iso_c_binding + ! lookup tables between 2stage kernel ids and the instruction-set constants; each map_* function below refills its table on every call + integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS) + integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR) + integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS) + integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR) + + contains + + function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index) + ! returns the instruction-set constant that the real 2stage kernel given in kernel is mapped to + use iso_c_binding + implicit none + + integer(kind=c_int), intent(in) :: kernel + integer(kind=c_int) :: simd_set_index + + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP) = BLUEGENE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ) = BLUEGENE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2) = AVX512_INSTR ! AVX-512 kernels must map to the AVX-512 entry, same as in the complex table + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4) = AVX512_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6) = AVX512_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU) = NVIDIA_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR +
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) = GENERIC_INSTR + + simd_set_index = realKernels_to_simdTable(kernel) + + + end + + function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel) + ! returns the real 2stage kernel chosen for the instruction set simd_set_index; table entries 1 and 12 are never assigned here + use iso_c_binding + implicit none + + + integer(kind=c_int) :: kernel + integer(kind=c_int), intent(in) :: simd_set_index + + simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC + simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP + simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2 + simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2 + simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2 + simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2 + simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_GPU + simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2 + simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 + simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2 + + kernel = simdTable_to_realKernels(simd_set_index) + + end + + function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index) + ! returns the instruction-set constant that the complex 2stage kernel given in kernel is mapped to + use iso_c_binding + implicit none + integer(kind=c_int), intent(in) :: kernel + integer(kind=c_int) :: simd_set_index + +
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC) = GENERIC_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP) = BLUEGENE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ) = BLUEGENE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1) = AVX_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2) = AVX_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1) = AVX2_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) = AVX2_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1) = AVX512_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) = AVX512_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU) = NVIDIA_INSTR + + + simd_set_index = complexKernels_to_simdTable(kernel) + + end + + function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel) + ! returns the complex 2stage kernel chosen for the instruction set simd_set_index; only the entries assigned below are defined + use iso_c_binding + implicit none + integer(kind=c_int) :: kernel + integer(kind=c_int), intent(in) :: simd_set_index + + simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC + simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP + simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1 + simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1 + simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1 + simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 + simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_GPU + + kernel = simdTable_to_complexKernels(simd_set_index) + + end + +end module + + diff --git a/src/helpers/print_build_config.c 
b/src/helpers/print_build_config.c index a2285a75556378f5e15ed4255979fb52c2d29961..c7f9a62d9a61e63bc6d82999d24ee05ea538984e 100644 --- a/src/helpers/print_build_config.c +++ b/src/helpers/print_build_config.c @@ -1,3 +1,51 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + + #include "config.h" #include "elpa/elpa_build_config.h" #include