...
 
Commits (4)
  • Andreas Marek's avatar
    Configure evaluates now the cpuid set · 78593813
    Andreas Marek authored
    78593813
  • Andreas Marek's avatar
    Add mixed-cluster option to configure · 973d0296
    Andreas Marek authored
    973d0296
  • Andreas Marek's avatar
    HETEROGENOUS_CLUSTER support · dd47b584
    Andreas Marek authored
    On heterogeneous clusters, i.e. clusters whose nodes have different CPUs, the
    _experimental_ feature (--enable-heterogenous-cluster-support) can be used:
    
    It compares the (Intel) cpuid set of all CPUs which are used by ELPA MPI
    processes and finds the SIMD instruction set, which is supported by all
    used CPUs. The ELPA 2stage back-transformation kernel (a.k.a "kernel")
    will be set accordingly on all MPI processes.
    
    This feature can override the kernel setting previously chosen by
    the user!
    
    At the moment it will only work for Intel CPUs, i.e. clusters consisting
    of nodes with Intel CPUs and e.g. AMD CPUs are at the moment _NOT_
    supported.
    
    Since this is an experimental feature, it might be dropped again in the
    future, if it turns out not to be useful for the users
    dd47b584
  • Andreas Marek's avatar
    Merge branch 'auto_detect' into 'master_pre_stage' · 65e33ecf
    Andreas Marek authored
    Auto detect
    
    See merge request !20
    65e33ecf
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -177,6 +177,11 @@ if !HAVE_DETAILED_TIMINGS
endif
endif
if HAVE_HETEROGENOUS_CLUSTER_SUPPORT
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90
endif
......@@ -480,6 +485,7 @@ nobase_elpa_include_HEADERS = \
$(wildcard modules/*) \
src/helpers/lapack_interfaces.h \
src/helpers/scalapack_interfaces.h \
elpa/elpa_simd_constants.h \
elpa/elpa.h \
elpa/elpa_generic.h \
elpa/elpa_legacy.h
......
This diff is collapsed.
......@@ -187,6 +187,27 @@ if test x"$c11_standard" = x"no"; then
fi
fi
AX_EXT
dnl heterogenous-cluster-support
dnl
dnl Optional (experimental) feature, switched on with
dnl   --enable-heterogenous-cluster-support
dnl When enabled, HAVE_HETEROGENOUS_CLUSTER_SUPPORT is defined for the
dnl preprocessor and set as an Automake conditional of the same name.
dnl Any value other than "yes" (including no option at all) disables it.
AC_MSG_CHECKING([whether heterogenous-cluster-support should be enabled])
AC_ARG_ENABLE([heterogenous-cluster-support],
[AS_HELP_STRING([--enable-heterogenous-cluster-support],
[allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary, since it has a performance penalty! This feature is experimental!])],
[
if test x"$enableval" = x"yes"; then
enable_heterogenous_cluster_support=yes
else
enable_heterogenous_cluster_support=no
fi
],
[enable_heterogenous_cluster_support="no"])
AC_MSG_RESULT([$enable_heterogenous_cluster_support])
if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs])
fi
AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
......@@ -745,7 +766,7 @@ ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[disable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -996,7 +1017,7 @@ if test x"${need_sse_assembly}" = x"yes"; then
rm -f ./conftest.o
AC_MSG_RESULT([${can_compile_sse_asm_double}])
if test x"$can_compile_sse_asm_double" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
if test x"${want_single_precision}" = x"yes" ; then
......@@ -1012,7 +1033,7 @@ if test x"${need_sse_assembly}" = x"yes"; then
rm -f ./conftest.o
AC_MSG_RESULT([${can_compile_sse_asm_single}])
if test x"$can_compile_sse_asm_single" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
fi
fi
......@@ -1035,7 +1056,7 @@ if test x"${need_avx}" = x"yes"; then
)
AC_MSG_RESULT([${can_compile_avx}])
if test x"$can_compile_avx" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX, try with --disable-avx, or adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile a test program with AVX, try with --disable-avx, or adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
fi
......@@ -1057,7 +1078,7 @@ if test x"${need_avx2}" = x"yes"; then
)
AC_MSG_RESULT([${can_compile_avx2}])
if test x"$can_compile_avx2" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX2, try with --disable-avx2, or adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile a test program with AVX2, try with --disable-avx2, or adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
......@@ -1079,7 +1100,7 @@ if test x"${need_avx512}" = x"yes"; then
)
AC_MSG_RESULT([${can_compile_avx512}])
if test x"$can_compile_avx512" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
......@@ -1584,6 +1605,41 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
])
dnl Hints for the user: for each SIMD instruction set, print a warning if the
dnl corresponding ax_cv_have_*_cpu_ext cache variable is "yes" but the
dnl matching need_* variable is "no".
dnl (presumably the ax_cv_have_*_cpu_ext variables are set by AX_EXT -- TODO confirm)
dnl The two conditions are joined with "&&" instead of "test ... -a ...",
dnl since the -a operator of test is not portable.
if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" && test x"${need_sse}" = x"no"; then
echo " "
AC_MSG_WARN([You did not request SSE support (--enable-sse), but your local CPU supports it.])
AC_MSG_WARN([You might want to re-configure, except you are cross-compiling])
echo " "
fi
if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" && test x"${need_sse_assembly}" = x"no"; then
echo " "
AC_MSG_WARN([You did not request SSE-ASSEMBLY support (--enable-sse-assembly), but your local CPU supports it.])
AC_MSG_WARN([You might want to re-configure, except you are cross-compiling])
echo " "
fi
if test x"${ax_cv_have_avx_cpu_ext}" = x"yes" && test x"${need_avx}" = x"no"; then
echo " "
AC_MSG_WARN([You did not request AVX support (--enable-avx), but your local CPU supports it.])
AC_MSG_WARN([You might want to re-configure, except you are cross-compiling])
echo " "
fi
if test x"${ax_cv_have_avx2_cpu_ext}" = x"yes" && test x"${need_avx2}" = x"no"; then
echo " "
AC_MSG_WARN([You did not request AVX2 support (--enable-avx2), but your local CPU supports it.])
AC_MSG_WARN([You might want to re-configure, except you are cross-compiling])
echo " "
fi
if test x"${ax_cv_have_avx512f_cpu_ext}" = x"yes" && test x"${need_avx512}" = x"no"; then
echo " "
AC_MSG_WARN([You did not request AVX512 support (--enable-avx512), but your local CPU supports it.])
AC_MSG_WARN([You might want to re-configure, except you are cross-compiling])
echo " "
fi
#echo " "
#echo "***********************************************************************"
#echo "* ELPA 2018.11.001 will be the last release where the old, obsolete, *"
......
#define CPU_MANUFACTURER 1
#define GENERIC_INSTR 2
#define BLUEGENE_INSTR 3
#define SSE_INSTR 4
#define AVX_INSTR 5
#define AVX2_INSTR 6
#define AVX512_INSTR 7
#define NVIDIA_INSTR 8
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define NUMBER_OF_INSTR 12
This diff is collapsed.
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_GCC_X86_AVX_XGETBV
#
# DESCRIPTION
#
# On later x86 processors with AVX SIMD support, with gcc or a compiler
# that has a compatible syntax for inline assembly instructions, run a
# small program that executes the xgetbv instruction with input OP. This
# can be used to detect if the OS supports AVX instruction usage.
#
# On output, the values of the eax and edx registers are stored as
# hexadecimal strings as "eax:edx" in the cache variable
# ax_cv_gcc_x86_avx_xgetbv.
#
# If the xgetbv instruction fails (because you are running a
# cross-compiler, or because you are not using gcc, or because you are on
# a processor that doesn't have this instruction),
# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown".
#
# This macro mainly exists to be used in AX_EXT.
#
# LICENSE
#
# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 3
# AX_GCC_X86_AVX_XGETBV(OP)
# Compiles and runs a small C program that executes xgetbv (emitted as raw
# opcode bytes) with OP in ecx, and writes the eax and edx results as
# hexadecimal "eax:edx" into the file conftest_xgetbv.
# The cache variable ax_cv_gcc_x86_avx_xgetbv_OP receives the content of that
# file, or the string "unknown" if the program fails or when cross-compiling.
# The temporary file is removed in both the success and the failure case.
AC_DEFUN([AX_GCC_X86_AVX_XGETBV],
[AC_REQUIRE([AC_PROG_CC])
AC_LANG_PUSH([C])
AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1,
[AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
int op = $1, eax, edx;
FILE *f;
/* Opcodes for xgetbv */
__asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0"
: "=a" (eax), "=d" (edx)
: "c" (op));
f = fopen("conftest_xgetbv", "w"); if (!f) return 1;
fprintf(f, "%x:%x\n", eax, edx);
fclose(f);
return 0;
])],
[ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv],
[ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv],
[ax_cv_gcc_x86_avx_xgetbv_$1=unknown])])
AC_LANG_POP([C])
])
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_GCC_X86_CPUID(OP)
# AX_GCC_X86_CPUID_COUNT(OP, COUNT)
#
# DESCRIPTION
#
# On Pentium and later x86 processors, with gcc or a compiler that has a
# compatible syntax for inline assembly instructions, run a small program
# that executes the cpuid instruction with input OP. This can be used to
# detect the CPU type. AX_GCC_X86_CPUID_COUNT takes an additional COUNT
# parameter that gets passed into register ECX before calling cpuid.
#
# On output, the values of the eax, ebx, ecx, and edx registers are stored
# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable
# ax_cv_gcc_x86_cpuid_OP.
#
# If the cpuid instruction fails (because you are running a
# cross-compiler, or because you are not using gcc, or because you are on
# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP
# is set to the string "unknown".
#
# This macro mainly exists to be used in AX_GCC_ARCHFLAG.
#
# LICENSE
#
# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
# Copyright (c) 2008 Matteo Frigo
# Copyright (c) 2015 Michael Petch <mpetch@capp-sysware.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 10
# AX_GCC_X86_CPUID(OP)
# Convenience wrapper: same as AX_GCC_X86_CPUID_COUNT with COUNT fixed to 0.
AC_DEFUN([AX_GCC_X86_CPUID],
[AX_GCC_X86_CPUID_COUNT($1, 0)
])
# AX_GCC_X86_CPUID_COUNT(OP, COUNT)
# Compiles and runs a small C program that executes cpuid with OP in eax and
# COUNT in ecx, and writes the four result registers as hexadecimal
# "eax:ebx:ecx:edx" into the file conftest_cpuid.
# ebx is exchanged with a scratch register around the cpuid instruction, so
# ebx itself is never named as an output operand of the asm statement.
# The cache variable ax_cv_gcc_x86_cpuid_OP receives the content of that file,
# or the string "unknown" if the program fails or when cross-compiling.
# NOTE: the cache variable name contains only OP, not COUNT.
AC_DEFUN([AX_GCC_X86_CPUID_COUNT],
[AC_REQUIRE([AC_PROG_CC])
AC_LANG_PUSH([C])
AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
[AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
int op = $1, level = $2, eax, ebx, ecx, edx;
FILE *f;
__asm__ __volatile__ ("xchg %%ebx, %1\n"
"cpuid\n"
"xchg %%ebx, %1\n"
: "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx)
: "a" (op), "2" (level));
f = fopen("conftest_cpuid", "w"); if (!f) return 1;
fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
fclose(f);
return 0;
])],
[ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
[ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
[ax_cv_gcc_x86_cpuid_$1=unknown])])
AC_LANG_POP([C])
])
......@@ -80,7 +80,7 @@ module elpa2_impl
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
!-------------------------------------------------------------------------------
!> \brief elpasolve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
......
......@@ -49,6 +49,9 @@
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "elpa/elpa_simd_constants.h"
function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
......@@ -64,7 +67,9 @@
use cuda_functions
use mod_check_for_gpu
use elpa_omp
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
use simd_kernel
#endif
use iso_c_binding
implicit none
#include "../general/precision_kinds.F90"
......@@ -74,14 +79,14 @@
logical :: useQR
logical :: useQRActual
#endif
integer(kind=c_int) :: kernel
integer(kind=c_int) :: kernel, kernelByUser
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
......@@ -124,6 +129,12 @@
do_trans_to_band, do_trans_to_full
integer(kind=ik) :: nrThreads
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR)
#endif
#if REALCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
......@@ -377,6 +388,88 @@
#endif
! consistency check: is user set kernel still identical with "kernel" or did
! we change it above? This is a mess and should be cleaned up
call obj%get(KERNEL_STRING,kernelByUser,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
if (kernelByUser .ne. kernel) then
call obj%set(KERNEL_STRING, kernel, error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
endif
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
! Heterogenous cluster support (experimental):
! query the SIMD instruction sets of the local CPU, combine the result over
! all MPI processes and, if the kernel which is currently set cannot be run
! on all used CPUs, switch to a kernel which is supported everywhere.
! at the moment this works only on Intel CPUs
simdSetAvailable(:) = 0
call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR)
#ifdef WITH_MPI
! bitwise AND over all processes: an entry stays 1 only if every process reported 1
call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, mpi_comm_all, mpierr)
#endif
! compare user chosen kernel with possible kernels
call obj%get(KERNEL_STRING,kernelByUser,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
! map kernel to SIMD Set, and check whether this set is available on all cores
#if REALCASE == 1
if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then
#endif
#if COMPLEXCASE == 1
if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then
#endif
! if we are not purely running on Intel CPUs, this feature does not work at the moment
! this restriction should be lifted step by step
if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then
! only process 0 prints the message ...
if (my_pe == 0 ) then
write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support."
write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!"
write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now"
endif
! ... but every process has to stop: if only process 0 stopped, all other
! processes would continue with a kernel which is not supported on all CPUs
stop
else
if (my_pe == 0 ) then
write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs"
write(error_unit,*) "ELPA will use another kernel..."
endif
! find best kernel available for supported instruction sets
! (index 1 is CPU_MANUFACTURER and thus not an instruction set, the loop stops at 2)
do i = NUMBER_OF_INSTR, 2, -1
if (simdSetAvailable(i) == 1) then
! map to "best" kernel with this instruction set
! this can be only done for kernels that ELPA has been configured to use
#if REALCASE == 1
kernel = map_simd_instruction_to_real_kernel(i)
#endif
#if COMPLEXCASE == 1
kernel = map_simd_instruction_to_complex_kernel(i)
#endif
! kernels which cannot be set are skipped, the next lower instruction set is tried
if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then
call obj%set(KERNEL_STRING, kernel, error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel)
exit
endif
endif
enddo
endif
endif
#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */
#if REALCASE == 1
call obj%get("qr",qr,error)
......
......@@ -1799,7 +1799,6 @@ module elpa_impl
#endif
end select
!print *, "testing, before C call, ts_impl%current is ", ts_impl%current
if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, &
ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then
......@@ -1812,7 +1811,6 @@ module elpa_impl
error = ELPA_ERROR_CANNOT_OPEN_FILE
#endif
endif
!print *, "testing, after C call, ts_impl%current is ", ts_impl%current
end subroutine
......
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include "elpa/elpa_simd_constants.h"
// Stores in set[CPU_MANUFACTURER - 1] whether the executing CPU is an Intel CPU:
// 1 if the CPUID vendor string is "GenuineIntel", 0 otherwise.
//
// set: array of flags, must provide at least CPU_MANUFACTURER entries
static inline void get_cpu_manufacturer(int *set)
{
  // registers[0] = eax, registers[1] = ebx, registers[2] = edx, registers[3] = ecx
  // CPUID leaf 0 returns the vendor string in the order ebx, edx, ecx; with
  // this layout the 12 characters are contiguous in registers[1..3]
  unsigned int registers[4] = {0u, 0u, 0u, 0u};
  static const char intel_vendor[] = "GenuineIntel";
  char manufacturer[13];

  // leaf 0 (eax = 0); ecx is set to 0 as well so that no operand is uninitialized
  __asm__ __volatile__("cpuid"
                       : "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[3]), "=d" (registers[2])
                       : "0" (0u), "2" (0u)
                       : "memory");

  // copy the raw register bytes (address of registers[1], not its value)
  memcpy(manufacturer, &registers[1], 12);
  manufacturer[12] = '\0';

  if (strcmp(manufacturer, intel_vendor) == 0) {
    set[CPU_MANUFACTURER - 1] = 1;
  } else {
    set[CPU_MANUFACTURER - 1] = 0;
  }
}
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
#include <cpuid.h>
// Thin wrapper around the __cpuid_count macro of <cpuid.h>:
// queries CPUID leaf InfoType with sub-leaf 0 and stores the four results
// in info[0..3]. The callers in this file read info[1], info[2] and info[3]
// as the ebx, ecx and edx feature words, so info[0] is presumably eax.
// Only available if heterogenous cluster support has been configured.
void cpuid(int info[4], int InfoType){
__cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
}
#endif
/*
!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
!f> interface
!f> subroutine get_cpuid_set(simdSet, n) &
!f> bind(C, name="get_cpuid_set")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int), value :: n
!f> integer(kind=c_int) :: simdSet(n)
!f> end subroutine
!f> end interface
!f>#endif
*/
void get_cpuid_set(int *set, int nlength){
get_cpu_manufacturer(set);
// Code below taken from http://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set/7495023#7495023
// Misc.
bool HW_MMX;
bool HW_x64;
bool HW_ABM; // Advanced Bit Manipulation
bool HW_RDRAND;
bool HW_BMI1;
bool HW_BMI2;
bool HW_ADX;
bool HW_PREFETCHWT1;
// SIMD: 128-bit
bool HW_SSE;
bool HW_SSE2;
bool HW_SSE3;
bool HW_SSSE3;
bool HW_SSE41;
bool HW_SSE42;
bool HW_SSE4a;
bool HW_AES;
bool HW_SHA;
// SIMD: 256-bit
bool HW_AVX;
bool HW_XOP;
bool HW_FMA3;
bool HW_FMA4;
bool HW_AVX2;
// SIMD: 512-bit
bool HW_AVX512F; // AVX512 Foundation
bool HW_AVX512CD; // AVX512 Conflict Detection
bool HW_AVX512PF; // AVX512 Prefetch
bool HW_AVX512ER; // AVX512 Exponential + Reciprocal
bool HW_AVX512VL; // AVX512 Vector Length Extensions
bool HW_AVX512BW; // AVX512 Byte + Word
bool HW_AVX512DQ; // AVX512 Doubleword + Quadword
bool HW_AVX512IFMA; // AVX512 Integer 52-bit Fused Multiply-Add
bool HW_AVX512VBMI; // AVX512 Vector Byte Manipulation Instructions
int info[4];
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
cpuid(info, 0);
int nIds = info[0];
cpuid(info, 0x80000000);
unsigned nExIds = info[0];
#endif
// Detect Features
if (nIds >= 0x00000001){
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
cpuid(info,0x00000001);
#endif
HW_MMX = (info[3] & ((int)1 << 23)) != 0;
HW_SSE = (info[3] & ((int)1 << 25)) != 0;
HW_SSE2 = (info[3] & ((int)1 << 26)) != 0;
HW_SSE3 = (info[2] & ((int)1 << 0)) != 0;
HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0;
HW_SSE41 = (info[2] & ((int)1 << 19)) != 0;
HW_SSE42 = (info[2] & ((int)1 << 20)) != 0;
HW_AES = (info[2] & ((int)1 << 25)) != 0;
HW_AVX = (info[2] & ((int)1 << 28)) != 0;
HW_FMA3 = (info[2] & ((int)1 << 12)) != 0;
HW_RDRAND = (info[2] & ((int)1 << 30)) != 0;
}
if (nIds >= 0x00000007){
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
cpuid(info,0x00000007);
#endif
HW_AVX2 = (info[1] & ((int)1 << 5)) != 0;
HW_BMI1 = (info[1] & ((int)1 << 3)) != 0;
HW_BMI2 = (info[1] & ((int)1 << 8)) != 0;
HW_ADX = (info[1] & ((int)1 << 19)) != 0;
HW_SHA = (info[1] & ((int)1 << 29)) != 0;
HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0;
HW_AVX512F = (info[1] & ((int)1 << 16)) != 0;
HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0;
HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0;
HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0;
HW_AVX512VL = (info[1] & ((int)1 << 31)) != 0;
HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0;
HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0;
HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0;
HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0;
}
if (nExIds >= 0x80000001){
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
cpuid(info,0x80000001);
#endif
HW_x64 = (info[3] & ((int)1 << 29)) != 0;
HW_ABM = (info[2] & ((int)1 << 5)) != 0;
HW_SSE4a = (info[2] & ((int)1 << 6)) != 0;
HW_FMA4 = (info[2] & ((int)1 << 16)) != 0;
HW_XOP = (info[2] & ((int)1 << 11)) != 0;
}
//allways allow GENERIC
set[GENERIC_INSTR -1] =1;
// the rest depends on the CPU
if (HW_SSE42) {
set[SSE_INSTR - 1] = 1;
}
if (HW_AVX) {
set[AVX_INSTR - 1] = 1;
}
if (HW_AVX2) {
set[AVX2_INSTR - 1] = 1;
}
if (HW_AVX512F) {
set[AVX512_INSTR -1] = 1;
}
}
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#include "config-f90.h"
#include "elpa/elpa_simd_constants.h"
!> \brief Mapping between the ELPA 2stage kernels and the SIMD instruction sets
!> defined in elpa/elpa_simd_constants.h.
!>
!> The module provides, for the real and the complex case, one function which
!> maps a kernel to the instruction set it needs, and one function which maps
!> an instruction set to one kernel using this instruction set.
!> The lookup tables are module variables; each function fills its table
!> every time it is called, before the lookup is done.
module simd_kernel
use elpa_constants
use iso_c_binding
! kernel -> instruction set (index: kernel constant)
integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS)
! instruction set -> kernel (index: instruction set constant)
integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR)
integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS)
integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR)
contains
!> \brief map a real 2stage kernel to the instruction set it needs
!> \param kernel          one of the ELPA_2STAGE_REAL_* constants listed below
!> \result simd_set_index the corresponding *_INSTR constant
function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index)
use iso_c_binding
implicit none
integer(kind=c_int), intent(in) :: kernel
integer(kind=c_int) :: simd_set_index
realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC) = GENERIC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE) = GENERIC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP) = BLUEGENE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ) = BLUEGENE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY) = SSE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2) = SSE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4) = SSE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6) = SSE_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2) = AVX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4) = AVX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6) = AVX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2) = AVX2_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4) = AVX2_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6) = AVX2_INSTR
! the AVX512 kernels need AVX512 (not AVX2), as in the complex case: otherwise
! an AVX512 kernel would be accepted as soon as all CPUs support AVX2
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2) = AVX512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4) = AVX512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6) = AVX512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU) = NVIDIA_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) = ARCH64_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) = ARCH64_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) = ARCH64_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2) = VSX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4) = VSX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6) = VSX_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) = GENERIC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) = GENERIC_INSTR
simd_set_index = realKernels_to_simdTable(kernel)
end
!> \brief map an instruction set to one real 2stage kernel using it
!> \param simd_set_index  one of the *_INSTR constants listed below;
!>                        CPU_MANUFACTURER and NUMBER_OF_INSTR have no table entry
!> \result kernel         the corresponding ELPA_2STAGE_REAL_* constant
function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel)
use iso_c_binding
implicit none
integer(kind=c_int) :: kernel
integer(kind=c_int), intent(in) :: simd_set_index
simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC
simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP
simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2
simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2
simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2
simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2
simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_GPU
simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2
simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2
kernel = simdTable_to_realKernels(simd_set_index)
end
!> \brief map a complex 2stage kernel to the instruction set it needs
!> \param kernel          one of the ELPA_2STAGE_COMPLEX_* constants listed below
!> \result simd_set_index the corresponding *_INSTR constant
function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index)
use iso_c_binding
implicit none
integer(kind=c_int), intent(in) :: kernel
integer(kind=c_int) :: simd_set_index
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC) = GENERIC_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP) = BLUEGENE_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ) = BLUEGENE_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) = SSE_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1) = SSE_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2) = SSE_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1) = AVX_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2) = AVX_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1) = AVX2_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) = AVX2_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1) = AVX512_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) = AVX512_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU) = NVIDIA_INSTR
simd_set_index = complexKernels_to_simdTable(kernel)
end
!> \brief map an instruction set to one complex 2stage kernel using it
!> \param simd_set_index  one of the *_INSTR constants listed below; the other
!>                        instruction sets have no table entry in the complex case
!> \result kernel         the corresponding ELPA_2STAGE_COMPLEX_* constant
function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel)
use iso_c_binding
implicit none
integer(kind=c_int) :: kernel
integer(kind=c_int), intent(in) :: simd_set_index
simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC
simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP
simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1
simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1
simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1
simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1
simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_GPU
kernel = simdTable_to_complexKernels(simd_set_index)
end
end module
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config.h"
#include "elpa/elpa_build_config.h"
#include <stdio.h>
......