Commit 69792b15 authored by Andreas Marek's avatar Andreas Marek

Introduction of new SSE kernels with different blocking

The SSE kernels with blocking of 2,4,6 (real case) and 1,2 (complex case)
are now available by default

Thus the following changes have been done
- introduce new macros in configure.ac and Makefile.am
- rename the AVX kernels to AVX_AVX2 (they also support AVX2)
- introduce new files with SSE kernel
- introduce new kernel parameters !
- make the SSE kernels callable

The results are identical with previous kernels
parent 1b6f931f
......@@ -79,31 +79,46 @@ if WITH_COMPLEX_SSE_KERNEL
endif
endif
# Optional ELPA2 kernel sources.
# Each Automake conditional below appends exactly one kernel source file to
# the library sources when it is enabled. The real kernels (2hv, 4hv, 6hv)
# are C files and the complex kernels (1hv, 2hv) are C++ files; every
# blocking exists in an SSE and an AVX/AVX2 flavour that is switched
# independently.
# NOTE(review): the WITH_* conditionals are presumably all declared with
# AM_CONDITIONAL in configure.ac - confirm that each one used here exists.
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
#endif
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
nobase_elpa_include_HEADERS = $(wildcard modules/*)
......
......@@ -196,12 +196,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul
# Evaluate the exit status of the command that ran immediately before this
# block (the SSE assembler test compile) and record whether the SSE kernels
# can be built.
# The numeric -eq operator is used on purpose: the == operator is a bash
# extension to test and is rejected by strictly POSIX shells such as dash,
# which would make the check fail regardless of the compile result.
if test "$?" -eq 0; then
can_compile_sse=yes
else
can_compile_sse=no
fi
# Every SSE kernel variant (assembler kernel plus the blocked real and
# complex kernels) shares the outcome of that single compile test.
install_real_sse=${can_compile_sse}
install_real_sse_block2=${can_compile_sse}
install_real_sse_block4=${can_compile_sse}
install_real_sse_block6=${can_compile_sse}
install_complex_sse=${can_compile_sse}
install_complex_sse_block1=${can_compile_sse}
install_complex_sse_block2=${can_compile_sse}
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
......@@ -477,7 +491,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment
nn
character(len=256) :: homedir
call get_environment_variable("HOME",homedir)
end program
......@@ -570,6 +583,15 @@ dnl real kernels
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl Configure options for the blocked real SSE kernels.
dnl NOTE(review): DEFINE_OPTION_SPECIFIC_REAL_KERNEL is defined outside this
dnl hunk; presumably argument 1 is the option name, argument 2 the kernel it
dnl selects and argument 3 the install flag to keep - confirm against the
dnl macro definition.
dnl Every argument is fully m4-quoted, including the -only suffix, so all
dnl three invocations follow the same quoting convention.
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel-only],[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
......@@ -600,6 +622,12 @@ dnl complex kernels
dnl complex-bgq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
......@@ -641,6 +669,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi
dnl Blocked real SSE kernels (blocking 2, 4 and 6).
dnl For each kernel the Automake conditional is always declared, and the
dnl preprocessor symbol of the same name is defined only when the matching
dnl install_real_sse_block* shell variable equals yes.
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
AS_IF([test x"${install_real_sse_block2}" = x"yes"],
[AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])])
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
AS_IF([test x"${install_real_sse_block4}" = x"yes"],
[AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])])
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
AS_IF([test x"${install_real_sse_block6}" = x"yes"],
[AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])])
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
......@@ -656,6 +699,16 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
dnl Blocked complex SSE kernels (blocking 1 and 2).
dnl For each kernel the Automake conditional is always declared, and the
dnl preprocessor symbol of the same name is defined only when the matching
dnl install_complex_sse_block* shell variable equals yes.
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
AS_IF([test x"${install_complex_sse_block1}" = x"yes"],
[AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])])
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
AS_IF([test x"${install_complex_sse_block2}" = x"yes"],
[AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])])
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
......
......@@ -3,11 +3,14 @@
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_NUMBER_OF_REAL_KERNELS 8
#define ELPA2_NUMBER_OF_REAL_KERNELS 11
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
......@@ -15,7 +18,9 @@
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 9
......@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
......@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
extern "C" {
//Forward declaration
......
......@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
......@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
extern "C" {
//Forward declaration
......
This diff is collapsed.
This diff is collapsed.
......@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
......@@ -74,6 +78,8 @@
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
......
......@@ -59,11 +59,14 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
......@@ -78,6 +81,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
......
......@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
......@@ -78,6 +82,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
//Forward declaration
static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -71,13 +71,16 @@ module ELPA2_utilities
public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -97,6 +100,9 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP
integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ
integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_REAL_KERNEL_SSE_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4 = ELPA2_REAL_KERNEL_SSE_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6 = ELPA2_REAL_KERNEL_SSE_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
......@@ -112,6 +118,9 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_BGP ", &
"REAL_ELPA_KERNEL_BGQ ", &
"REAL_ELPA_KERNEL_SSE ", &
"REAL_ELPA_KERNEL_SSE_BLOCK2 ", &
"REAL_ELPA_KERNEL_SSE_BLOCK4 ", &
"REAL_ELPA_KERNEL_SSE_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK6 "/)
......@@ -122,6 +131,8 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP
integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK1 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
......@@ -136,6 +147,8 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_BGP ", &
"COMPLEX_ELPA_KERNEL_BGQ ", &
"COMPLEX_ELPA_KERNEL_SSE ", &
"COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/)
......@@ -167,20 +180,36 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
#if WITH_REAL_SSE_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
#if WITH_REAL_SSE_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
#if WITH_REAL_SSE_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
/)
......@@ -212,16 +241,27 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
#if WITH_COMPLEX_SSE_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
#if WITH_COMPLEX_SSE_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
/)
!******
......
......@@ -90,6 +90,34 @@ module compute_hh_trafo_complex
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif
! Complex SSE kernel with blocking 2.
! The section is compiled only when the SSE block2 complex kernel was enabled
! at configure time. When no specific complex kernel was fixed at build time,
! it is additionally guarded by a runtime comparison of the selected kernel.
#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
! Timestamp taken before the kernel loop; where ttt is consumed is not
! visible in this hunk.
ttt = mpi_wtime()
! Walk the columns from ncols downwards two at a time. Columns j and j-1 of
! bcast_buffer (shifted by off) are copied into w(:,1) and w(:,2).
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
! The OpenMP build passes the slice of a that belongs to my_thread.
#ifdef WITH_OPENMP
call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
! After the loop j equals 1 exactly when ncols is odd and positive; the one
! remaining column (off+1) is then handled by the single-vector SSE kernel.
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe,my_thread), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then
......@@ -213,6 +241,25 @@ module compute_hh_trafo_complex
! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
! Complex SSE kernel with blocking 1.
! The section is compiled only when the SSE block1 complex kernel was enabled
! at configure time. When no specific complex kernel was fixed at build time,
! it is additionally guarded by a runtime comparison of the selected kernel.
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
! Timestamp taken before the kernel loop; where ttt is consumed is not
! visible in this hunk.
ttt = mpi_wtime()
! Walk the columns from ncols down to 1, one column per kernel call.
do j = ncols, 1, -1
! The OpenMP build passes the slice of a that belongs to my_thread.
#ifdef WITH_OPENMP
call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe,my_thread), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then
......
......@@ -104,6 +104,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or. &
......@@ -208,6 +209,25 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */
! Real SSE kernel with blocking 2.
! The section is compiled only when the SSE block2 real kernel was enabled at
! configure time. When no specific real kernel was fixed at build time, it is
! additionally guarded by a runtime comparison of the selected kernel.
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! Walk the columns from ncols downwards two at a time. Columns j and j-1 of
! bcast_buffer (shifted by off) are copied into w(:,1) and w(:,2).
! NOTE(review): no handling of a leftover column for odd ncols is visible in
! this block - presumably it happens elsewhere in the routine; confirm.
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
! The OpenMP build passes the slice of a that belongs to my_thread.
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_2hv(a(1,j+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......@@ -292,6 +312,47 @@ module compute_hh_trafo_real
! Real SSE kernel with blocking 4.
! The section is compiled only when the SSE block4 real kernel was enabled at
! configure time. When no specific real kernel was fixed at build time, it is
! additionally guarded by a runtime comparison of the selected kernel.
! Columns are consumed from ncols downwards: first in groups of four, then
! one possible group of two, then one possible single column.
#if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_4hv(a(1,j+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
! j now holds the number of columns the 4-blocked loop left over (below 4);
! the 2-blocked SSE kernel continues from there.
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_2hv(a(1,jj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_2hv(a(1,jj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
! A final value of jj equal to 1 means one column remains; it is handled by
! the single-vector routine named single_hh_trafo_real_cpu (or its OpenMP
! variant) rather than by an SSE kernel.
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
......@@ -333,6 +394,61 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */
! Real SSE kernel with blocking 6.
! The section is compiled only when the SSE block6 real kernel was enabled at
! configure time. When no specific real kernel was fixed at build time, it is
! additionally guarded by a runtime comparison of the selected kernel.
! Columns are consumed from ncols downwards: first in groups of six, then one
! possible group of four, then one possible group of two, then one possible
! single column.
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK6) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_real_sse_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_real_sse_6hv(a(1,j+off+a_off-5,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
! j now holds the number of columns the 6-blocked loop left over (below 6);
! the 4-blocked SSE kernel continues from there.
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_4hv(a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
! jj now holds what the 4-blocked loop left over (below 4); the 2-blocked
! SSE kernel continues from there.
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_2hv(a(1,jjj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
! A final value of jjj equal to 1 means one column remains; it is handled by
! the single-vector routine named single_hh_trafo_real_cpu (or its OpenMP
! variant) rather than by an SSE kernel.
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK6_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......
......@@ -91,6 +91,7 @@
print *," "
print *,"This ELPA2 is build with"
#if DATATYPE == REAL
#ifdef HAVE_AVX2
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
......@@ -103,18 +104,18 @@
print *,"AVX2 optimized kernel (6 blocking) for real matrices"
#endif
#else
#else /* no HAVE_AVX2 */
#ifdef HAVE_AVX
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
print *,"AVX2 optimized kernel (2 blocking) for real matrices"
print *,"AVX optimized kernel (2 blocking) for real matrices"
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
print *,"AVX2 optimized kernel (4 blocking) for real matrices"
print *,"AVX optimized kernel (4 blocking) for real matrices"
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
print *,"AVX2 optimized kernel (6 blocking) for real matrices"
print *,"AVX optimized kernel (6 blocking) for real matrices"
#endif
#endif
......@@ -151,7 +152,7 @@
print *,"AVX2 optimized kernel (1 blocking) for complex matrices"
#endif
#else
#else /* no HAVE_AVX2 */
#ifdef HAVE_AVX
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment