From dd47b5849bda8db240863a943f9d4159adb965ff Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Thu, 10 Oct 2019 13:46:30 +0200 Subject: [PATCH] HETEROGENOUS_CLUSTER support On a heterogeneous cluster, i.e. one whose nodes have different CPUs, the _experimental_ feature (--enable-heterogenous-cluster-support) can be used: it compares the (Intel) cpuid feature sets of all CPUs used by ELPA MPI processes and finds the SIMD instruction set that is supported by all of them. The ELPA 2stage back-transformation kernel (a.k.a. "kernel") will be set accordingly on all MPI processes. This feature can override the kernel setting previously chosen by the user! At the moment it only works for Intel CPUs, i.e. clusters that mix nodes with Intel CPUs and e.g. AMD CPUs are _NOT_ supported. Since this is an experimental feature, it might be dropped again in the future if it turns out not to be useful for the users. --- Makefile.am | 6 + configure.ac | 23 ++-- elpa/elpa_simd_constants.h | 13 ++ src/elpa2/elpa2.F90 | 2 +- src/elpa2/elpa2_template.F90 | 101 +++++++++++++- src/elpa_impl.F90 | 2 - src/helpers/get_cpuid_set.c | 221 +++++++++++++++++++++++++++++++ src/helpers/mod_simd_kernel.F90 | 171 ++++++++++++++++++++++++ src/helpers/print_build_config.c | 48 +++++++ 9 files changed, 569 insertions(+), 18 deletions(-) create mode 100644 elpa/elpa_simd_constants.h create mode 100644 src/helpers/get_cpuid_set.c create mode 100644 src/helpers/mod_simd_kernel.F90 diff --git a/Makefile.am b/Makefile.am index e35903d3..01d55200 100644 --- a/Makefile.am +++ b/Makefile.am @@ -177,6 +177,11 @@ if !HAVE_DETAILED_TIMINGS endif endif + +if HAVE_HETEROGENOUS_CLUSTER_SUPPORT + libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90 +endif + if WITH_REAL_GENERIC_KERNEL libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90 endif @@ -480,6 +485,7 @@ nobase_elpa_include_HEADERS = \ $(wildcard modules/*) \ 
src/helpers/lapack_interfaces.h \ src/helpers/scalapack_interfaces.h \ + elpa/elpa_simd_constants.h \ elpa/elpa.h \ elpa/elpa_generic.h \ elpa/elpa_legacy.h diff --git a/configure.ac b/configure.ac index 77675169..98a07035 100644 --- a/configure.ac +++ b/configure.ac @@ -189,23 +189,24 @@ fi AX_EXT -dnl mixed-cluster-support -AC_MSG_CHECKING(whether mixed-cluster-support should be enabled) -AC_ARG_ENABLE([mixed-cluster-support], - AS_HELP_STRING([--mixed-cluster-support], - [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary has a performance penalty!]), +dnl heterogenous-cluster-support +AC_MSG_CHECKING(whether heterogenous-cluster-support should be enabled) +AC_ARG_ENABLE([heterogenous-cluster-support], + AS_HELP_STRING([--enable-heterogenous-cluster-support], + [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary, as it has a performance penalty! 
This feature is experimental!]), [ if test x"$enableval" = x"yes"; then - enable_mixed_cluster_support=yes + enable_heterogenous_cluster_support=yes else - enable_mixed_cluster_support=no + enable_heterogenous_cluster_support=no fi ], - [enable_mixed_cluster_support="no"]) -AC_MSG_RESULT([$enable_mixed_cluster_support]) -if test x"${enable_mixed_cluster_support}" = x"yes"; then - AC_DEFINE([HAVE_MIXED_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs]) + [enable_heterogenous_cluster_support="no"]) +AC_MSG_RESULT([$enable_heterogenous_cluster_support]) +if test x"${enable_heterogenous_cluster_support}" = x"yes"; then + AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs]) fi +AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"]) AC_MSG_CHECKING(whether C compiler can use _Generic ) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ diff --git a/elpa/elpa_simd_constants.h b/elpa/elpa_simd_constants.h new file mode 100644 index 00000000..6049ee71 --- /dev/null +++ b/elpa/elpa_simd_constants.h @@ -0,0 +1,13 @@ +#define CPU_MANUFACTURER 1 +#define GENERIC_INSTR 2 +#define BLUEGENE_INSTR 3 +#define SSE_INSTR 4 +#define AVX_INSTR 5 +#define AVX2_INSTR 6 +#define AVX512_INSTR 7 +#define NVIDIA_INSTR 8 +#define VSX_INSTR 9 +#define ARCH64_INSTR 10 +#define SPARC_INSTR 11 + +#define NUMBER_OF_INSTR 12 diff --git a/src/elpa2/elpa2.F90 b/src/elpa2/elpa2.F90 index 9b428da7..f31f5063 100644 --- a/src/elpa2/elpa2.F90 +++ b/src/elpa2/elpa2.F90 @@ -80,7 +80,7 @@ module elpa2_impl #define DOUBLE_PRECISION 1 #include "../general/precision_macros.h" !------------------------------------------------------------------------------- -!> \brief elpasolve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach +!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the 
double-precision real eigenvalue problem with a 2 stage approach !> !> Parameters !> diff --git a/src/elpa2/elpa2_template.F90 b/src/elpa2/elpa2_template.F90 index ec96bd6e..b5981214 100644 --- a/src/elpa2/elpa2_template.F90 +++ b/src/elpa2/elpa2_template.F90 @@ -49,6 +49,9 @@ ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". + +#include "elpa/elpa_simd_constants.h" + function elpa_solve_evp_& &MATH_DATATYPE& &_& @@ -64,7 +67,9 @@ use cuda_functions use mod_check_for_gpu use elpa_omp - +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + use simd_kernel +#endif use iso_c_binding implicit none #include "../general/precision_kinds.F90" @@ -74,14 +79,14 @@ logical :: useQR logical :: useQRActual #endif - integer(kind=c_int) :: kernel + integer(kind=c_int) :: kernel, kernelByUser #ifdef USE_ASSUMED_SIZE MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*) - MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*) + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*) #else MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols) - MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols) + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols) #endif real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na) MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:) @@ -124,6 +129,12 @@ do_trans_to_band, do_trans_to_full integer(kind=ik) :: nrThreads +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR) +#endif + + + #if REALCASE == 1 #undef GPU_KERNEL #undef GENERIC_KERNEL @@ -377,6 +388,88 @@ #endif + ! 
consistency check: is user set kernel still identical with "kernel" or did + ! we change it above? This is a mess and should be cleaned up + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + if (kernelByUser .ne. kernel) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + endif + +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + ! find a kernel which is supported on all used CPUs + ! at the moment this works only on Intel CPUs + simdSetAvailable(:) = 0 + call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR) +#ifdef WITH_MPI + call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, mpi_comm_all, mpierr) +#endif + + ! compare user chosen kernel with possible kernels + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + ! map kernel to SIMD Set, and check whether this is set is available on all cores + +#if REALCASE == 1 + if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then +#endif +#if COMPLEXCASE == 1 + if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then +#endif + + ! if we are not purely running on Intel CPUs, this feature does not work at the moment + ! this restriction should be lifted step by step + if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then + if (my_pe == 0 ) then + write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support." + write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!" 
+ write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now" + stop + endif + else + if (my_pe == 0 ) then + write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs" + write(error_unit,*) "ELPA will use another kernel..." + endif + + ! find best kernel available for supported instruction sets + do i = NUMBER_OF_INSTR, 2, -1 + if (simdSetAvailable(i) == 1) then + ! map to "best" kernel with this instruction set + ! this can be only done for kernels that ELPA has been configured to use +#if REALCASE == 1 + kernel = map_simd_instruction_to_real_kernel(i) +#endif +#if COMPLEXCASE == 1 + kernel = map_simd_instruction_to_complex_kernel(i) +#endif + if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel) + exit + endif + endif + enddo + endif + + endif +#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */ #if REALCASE == 1 call obj%get("qr",qr,error) diff --git a/src/elpa_impl.F90 b/src/elpa_impl.F90 index dcc0592e..4643a344 100644 --- a/src/elpa_impl.F90 +++ b/src/elpa_impl.F90 @@ -1799,7 +1799,6 @@ module elpa_impl #endif end select - !print *, "testing, before C call, ts_impl%current is ", ts_impl%current if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, & ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then @@ -1812,7 +1811,6 @@ module elpa_impl error = ELPA_ERROR_CANNOT_OPEN_FILE #endif endif - !print *, "testing, after C call, ts_impl%current is ", ts_impl%current end subroutine diff --git a/src/helpers/get_cpuid_set.c b/src/helpers/get_cpuid_set.c new file mode 100644 index 00000000..5e204f2d --- /dev/null +++ b/src/helpers/get_cpuid_set.c @@ -0,0 +1,221 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+//
+// Author: Andreas Marek, MPCDF
+
+#include "config.h"
+
+// NOTE(review): the names of the system headers were lost when this patch was
+// extracted; the three below are reconstructed from what the code actually
+// uses (bool, uint32_t, memcpy/strcmp) -- confirm against the upstream file.
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "elpa/elpa_simd_constants.h"
+
+/*
+ * Record in set[CPU_MANUFACTURER - 1] whether the executing CPU identifies
+ * itself as "GenuineIntel" (1) or as anything else (0).
+ *
+ * set must provide room for at least CPU_MANUFACTURER entries.
+ */
+static inline void get_cpu_manufacturer(int *set)
+{
+  // CPUID leaf 0 (EAX = 0, ECX = 0 on input) returns the 12 byte vendor
+  // string in EBX, EDX, ECX -- in this order.
+  uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
+  __asm__ volatile("cpuid" : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
+
+  char manufacturer[13];
+  memcpy(manufacturer,     &ebx, 4);
+  memcpy(manufacturer + 4, &edx, 4);
+  memcpy(manufacturer + 8, &ecx, 4);
+  manufacturer[12] = '\0';
+
+  set[CPU_MANUFACTURER - 1] = (strcmp(manufacturer, "GenuineIntel") == 0) ? 1 : 0;
+}
+
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+#include <cpuid.h>
+// Execute CPUID for leaf InfoType (sub-leaf 0); info receives EAX, EBX, ECX, EDX.
+void cpuid(int info[4], int InfoType){
+  __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
+}
+#endif
+
+// Flag the instruction set with the 1-based index instr (see
+// elpa_simd_constants.h) as available; indices that do not fit into the
+// caller's array of nlength entries are silently ignored.
+static inline void mark_available(int *set, int nlength, int instr)
+{
+  if (instr >= 1 && instr <= nlength) {
+    set[instr - 1] = 1;
+  }
+}
+
+/*
+!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+!f> interface
+!f> subroutine get_cpuid_set(simdSet, n) &
+!f> bind(C, name="get_cpuid_set")
+!f> use, intrinsic :: iso_c_binding
+!f> integer(kind=c_int), value :: n
+!f> integer(kind=c_int) :: simdSet(n)
+!f> end subroutine
+!f> end interface
+!f>#endif
+*/
+/*
+ * Report which SIMD instruction sets the executing CPU supports.
+ *
+ * set      array of nlength ints; entry (X_INSTR - 1) is set to 1 when the
+ *          instruction set X_INSTR of elpa_simd_constants.h is usable, and
+ *          entry (CPU_MANUFACTURER - 1) is set to 1/0 for Intel/non-Intel.
+ *          Entries of unsupported instruction sets are left untouched, so the
+ *          caller has to zero the array beforehand.
+ * nlength  number of entries in set; nothing is written beyond it.
+ */
+void get_cpuid_set(int *set, int nlength){
+
+  // nothing can be reported without room for at least the manufacturer entry
+  if (set == NULL || nlength < CPU_MANUFACTURER) {
+    return;
+  }
+
+  get_cpu_manufacturer(set);
+
+  // Start from "not supported": a CPUID leaf the CPU does not implement must
+  // never leave a flag undefined.
+  bool HW_SSE42   = false;
+  bool HW_AVX     = false;
+  bool HW_AVX2    = false;
+  bool HW_AVX512F = false;   // AVX512 Foundation
+
+#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
+  // Bit positions taken from http://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set/7495023#7495023
+  int info[4] = {0, 0, 0, 0};
+
+  cpuid(info, 0);
+  const int nIds = info[0];   // highest basic CPUID leaf implemented
+
+  // Detect Features
+  if (nIds >= 0x00000001){
+    cpuid(info, 0x00000001);
+    HW_SSE42 = ((unsigned) info[2] & (1u << 20)) != 0;
+    HW_AVX   = ((unsigned) info[2] & (1u << 28)) != 0;
+  }
+  if (nIds >= 0x00000007){
+    cpuid(info, 0x00000007);
+    HW_AVX2    = ((unsigned) info[1] & (1u <<  5)) != 0;
+    HW_AVX512F = ((unsigned) info[1] & (1u << 16)) != 0;
+  }
+#endif
+
+  // always allow GENERIC
+  mark_available(set, nlength, GENERIC_INSTR);
+
+  // the rest depends on the CPU
+  if (HW_SSE42) {
+    mark_available(set, nlength, SSE_INSTR);
+  }
+  if (HW_AVX) {
+    mark_available(set, nlength, AVX_INSTR);
+  }
+  if (HW_AVX2) {
+    mark_available(set, nlength, AVX2_INSTR);
+  }
+  if (HW_AVX512F) {
+    mark_available(set, nlength, AVX512_INSTR);
+  }
+
+}
+
+
diff --git a/src/helpers/mod_simd_kernel.F90 b/src/helpers/mod_simd_kernel.F90
new file mode 100644
index 00000000..5d956b8c
--- /dev/null
+++ b/src/helpers/mod_simd_kernel.F90
@@ -0,0 +1,171 @@
+! This file is part of ELPA.
+!
+! The ELPA library was originally created by the ELPA consortium,
+! consisting of the following organizations:
+!
+! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+! Informatik,
+! - Technische Universität München, Lehrstuhl für Informatik mit
+! Schwerpunkt Wissenschaftliches Rechnen ,
+! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
+! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
+! and
+! - IBM Deutschland GmbH
+!
+!
+! 
More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF
+
+#include "config-f90.h"
+#include "elpa/elpa_simd_constants.h"
+
+!> Translation tables between the ELPA 2stage kernels and the indices of the
+!> SIMD instruction sets defined in elpa/elpa_simd_constants.h
+module simd_kernel
+  use elpa_constants
+  use iso_c_binding
+
+  integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS)
+  integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR)
+  integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS)
+  integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR)
+
+  contains
+
+  !> Return the index of the SIMD instruction set that the given real
+  !> 2stage kernel needs in order to run.
+  !> \param kernel          one of the real 2stage kernel constants
+  !> \result simd_set_index index of the required instruction set
+  function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index)
+
+    use iso_c_binding
+    implicit none
+
+    integer(kind=c_int), intent(in) :: kernel
+    integer(kind=c_int)             :: simd_set_index
+
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC) = GENERIC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE) = GENERIC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP) = BLUEGENE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ) = BLUEGENE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY) = SSE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2) = SSE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4) = SSE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6) = SSE_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2) = AVX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4) = AVX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6) = AVX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2) = AVX2_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4) = AVX2_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6) = AVX2_INSTR
+    ! the 512 bit kernels need the 512 bit instruction set; mapping them to the
+    ! 256 bit one would let them pass the availability check on CPUs that
+    ! cannot execute them
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2) = AVX512_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4) = AVX512_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6) = AVX512_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU) = NVIDIA_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) = ARCH64_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) = ARCH64_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) = ARCH64_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2) = VSX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4) = VSX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6) = VSX_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) = GENERIC_INSTR
+    realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) = GENERIC_INSTR
+
+    simd_set_index = realKernels_to_simdTable(kernel)
+
+  end function
+
+  !> Return the real 2stage kernel that is preferred for the given SIMD
+  !> instruction set.
+  !> \param simd_set_index index of the instruction set (1 .. table size)
+  !> \result kernel        a real 2stage kernel constant
+  function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel)
+
+    use iso_c_binding
+    implicit none
+
+    integer(kind=c_int)             :: kernel
+    integer(kind=c_int), intent(in) :: simd_set_index
+
+    ! indices without a dedicated kernel fall back to the generic kernel, so
+    ! that no table entry is ever read while still undefined
+    simdTable_to_realKernels(:) = ELPA_2STAGE_REAL_GENERIC
+
+    simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC
+    simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
+    simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2
+    simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2
+    simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2
+    simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2
+    simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_GPU
+    simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2
+    simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
+    simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2
+
+    kernel = simdTable_to_realKernels(simd_set_index)
+
+  end function
+
+  !> Return the index of the SIMD instruction set that the given complex
+  !> 2stage kernel needs in order to run.
+  !> \param kernel          one of the complex 2stage kernel constants
+  !> \result simd_set_index index of the required instruction set
+  function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index)
+
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int), intent(in) :: kernel
+    integer(kind=c_int)             :: simd_set_index
+
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC) = GENERIC_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP) = BLUEGENE_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ) = BLUEGENE_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) = SSE_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1) = SSE_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2) = SSE_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1) = AVX_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2) = AVX_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1) = AVX2_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) = AVX2_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1) = AVX512_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) = AVX512_INSTR
+    complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU) = NVIDIA_INSTR
+
+    simd_set_index = complexKernels_to_simdTable(kernel)
+
+  end function
+
+  !> Return the complex 2stage kernel that is preferred for the given SIMD
+  !> instruction set.
+  !> \param simd_set_index index of the instruction set (1 .. table size)
+  !> \result kernel        a complex 2stage kernel constant
+  function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel)
+
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int)             :: kernel
+    integer(kind=c_int), intent(in) :: simd_set_index
+
+    ! indices without a dedicated kernel fall back to the generic kernel, so
+    ! that no table entry is ever read while still undefined
+    simdTable_to_complexKernels(:) = ELPA_2STAGE_COMPLEX_GENERIC
+
+    simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC
+    simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
+    simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
+    simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
+    simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
+    simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
+    simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_GPU
+
+    kernel = simdTable_to_complexKernels(simd_set_index)
+
+  end function
+
+end module
+
+
diff --git a/src/helpers/print_build_config.c 
b/src/helpers/print_build_config.c index a2285a75..c7f9a62d 100644 --- a/src/helpers/print_build_config.c +++ b/src/helpers/print_build_config.c @@ -1,3 +1,51 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + + #include "config.h" #include "elpa/elpa_build_config.h" #include -- GitLab