Commit e36e3f50 authored by Andreas Marek
Browse files

First implementation of AVX-512 kernels

For the double-precision real case, the block2 and block4 kernels
have been ported to AVX-512 as a first test.
parent b7e778b4
......@@ -2200,6 +2200,34 @@ gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-
- coverage_data
#real avx512 block2, complex avx block1 (emulated)
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block2-complex-avx_block1-kernel-jobs:
  # Single-precision build with OpenMP enabled, restricted to the real
  # AVX-512 block2 kernel and the complex AVX block1 kernel.
  # Scheduled on runners carrying the "emulated" tag.
  tags: [emulated]
  script:
    # Generate the build system and configure for exactly one real and one complex kernel.
    - ./autogen.sh
    - ./configure FC=mpif90 CFLAGS="-O3 -mavx512f -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block2-kernel-only --with-complex-avx_block1-kernel-only --enable-single-precision
    - make -j 8
    # Runtime environment for the test programs: two OpenMP threads, MKL on the library path.
    - export OMP_NUM_THREADS=2
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # The test suite is launched through the sde binary with the -knl option.
    - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='100 25 16'
# Coverage variant of the real AVX-512 block2 / complex AVX block1 job.
# The job name previously started with "gortran-"; it now uses the same
# "gfortran-" prefix as every other job of this compiler family.
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block2-complex-avx_block1-kernel-special-gcov-jobs:
  tags:
    - emulated
  script:
    - ./autogen.sh
    # Same kernel selection as the non-coverage job, with --coverage added to the C and Fortran flags.
    - ./configure FC=mpif90 CFLAGS="--coverage -O3 -mavx512f -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block2-kernel-only --with-complex-avx_block1-kernel-only --enable-single-precision
    - make -j 8
    - export OMP_NUM_THREADS=1
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # The test suite is launched through the sde binary with the -knl option.
    - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='150 50 16'
    - ./ci_coverage_collect
  # Keep the collected coverage data as a job artifact.
  artifacts:
    paths:
      - coverage_data
#real avx2 block2, complex avx2 block1 (emulated)
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
......@@ -2266,6 +2294,34 @@ gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-
paths:
- coverage_data
#real avx512 block4, complex avx block2 (emulated)
# The job name now matches what is actually configured below (real AVX-512
# block4, complex AVX block2). The previous name claimed avx2_block4 /
# avx2_block2, which mislabels the job and can collide with the job key of the
# genuine AVX2 configuration.
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block4-complex-avx_block2-kernel-jobs:
  tags:
    - emulated
  script:
    - ./autogen.sh
    - ./configure FC=mpif90 CFLAGS="-O3 -mavx512f -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block4-kernel-only --with-complex-avx_block2-kernel-only --enable-single-precision
    - make -j 8
    - export OMP_NUM_THREADS=2
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # The test suite is launched through the sde binary with the -knl option.
    - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='100 25 16'
# Coverage variant of the real AVX-512 block4 / complex AVX block2 job.
# The configure line previously selected the avx2_block4 / avx2_block2 kernels,
# so this job never built the AVX-512 kernel its name refers to. The kernel
# switches now match the job name and the non-coverage sibling job.
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block4-complex-avx_block2-kernel-special-gcov-jobs:
  tags:
    - emulated
  script:
    - ./autogen.sh
    - ./configure FC=mpif90 CFLAGS="--coverage -O3 -mavx512f -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block4-kernel-only --with-complex-avx_block2-kernel-only --enable-single-precision
    - make -j 8
    - export OMP_NUM_THREADS=1
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # The test suite is launched through the sde binary with the -knl option.
    - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='150 50 16'
    - ./ci_coverage_collect
  # Keep the collected coverage data as a job artifact.
  artifacts:
    paths:
      - coverage_data
#real avx2 block4, complex avx2 block2 (emulated)
......
......@@ -12,9 +12,12 @@
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK2 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK4 16
#define ELPA2_REAL_KERNEL_AVX512_BLOCK6 17
#define ELPA2_REAL_KERNEL_GPU 18
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 18
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
......@@ -27,6 +30,9 @@
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2 13
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
#define ELPA2_COMPLEX_KERNEL_GPU 14
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 14
......@@ -3558,11 +3558,30 @@
stripe_width = (l_nev-1)/stripe_count + 1
#endif
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
    THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4 .or. &
    THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6) then
  ! Round UP to the next multiple of 8. The addend has to be (8-1) = 7:
  ! with +3 the integer division rounds down for many inputs
  ! (e.g. 12 -> 8, 4 -> 0), which shrinks the stripe instead of padding it.
  stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                        ! (8 * sizeof(double) == 64)
else
  ! Round UP to the next multiple of 4.
  stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
                                        ! (4 * sizeof(double) == 32)
endif
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
    THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4 .or. &
    THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6) then
  ! Round UP to the next multiple of 16. Both branches used to round to 8,
  ! which contradicts the 64-byte requirement stated here for single precision.
  ! A multiple of 16 is also a multiple of 8, so the weaker constraint still holds.
  stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
                                           ! (16 * sizeof(float) == 64)
else
  ! Round UP to the next multiple of 8.
  stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
                                        ! (8 * sizeof(float) == 32)
endif
#endif
else ! GPUs are used
stripe_width = 256 ! Must be a multiple of 4
......
......@@ -79,6 +79,9 @@ module ELPA2_utilities
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6, &
REAL_ELPA_KERNEL_AVX512_BLOCK2, &
REAL_ELPA_KERNEL_AVX512_BLOCK4, REAL_ELPA_KERNEL_AVX512_BLOCK6, &
REAL_ELPA_KERNEL_GPU, DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
......@@ -87,6 +90,7 @@ module ELPA2_utilities
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX512_BLOCK1,COMPLEX_ELPA_KERNEL_AVX512_BLOCK2, &
COMPLEX_ELPA_KERNEL_GPU, DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -117,6 +121,10 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4 = ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX512_BLOCK2 = ELPA2_REAL_KERNEL_AVX512_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX512_BLOCK4 = ELPA2_REAL_KERNEL_AVX512_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX512_BLOCK6 = ELPA2_REAL_KERNEL_AVX512_BLOCK6
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GPU = ELPA2_REAL_KERNEL_GPU
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
......@@ -178,6 +186,22 @@ module ELPA2_utilities
#endif
#endif /* #if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK6
#else
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK4
#else
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
......@@ -248,6 +272,21 @@ module ELPA2_utilities
#endif
#endif /* #if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK6
#else
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK4
#else
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX512_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
......@@ -278,6 +317,9 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_AVX2_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX512_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX512_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX512_BLOCK6 ", &
"REAL_ELPA_KERNEL_GPU "/)
integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
......@@ -292,6 +334,9 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX512_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX512_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA2_COMPLEX_KERNEL_GPU
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
......@@ -341,6 +386,16 @@ module ELPA2_utilities
#endif
#endif /* defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX512_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX512_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX512_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX512_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
......@@ -396,6 +451,17 @@ module ELPA2_utilities
#endif
#endif /* defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX512_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX512_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX512_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX512_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
......@@ -416,6 +482,8 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX512_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX512_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_GPU "/)
integer(kind=ik), parameter :: &
......@@ -492,10 +560,26 @@ module ELPA2_utilities
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
#if WITH_REAL_AVX512_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX512_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX512_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif
/)
......@@ -557,10 +641,21 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_COMPLEX_AVX512_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX512_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
,1 &
#else
,0 &
,0 &
#endif
/)
......
......@@ -103,7 +103,7 @@ module compute_hh_trafo_real
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
use kernel_interfaces
#endif
implicit none
......@@ -189,6 +189,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
......@@ -345,6 +346,34 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
! AVX-512 kernel working on 2 Householder vectors per call (double precision).
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
! Kernel is chosen at run time: only enter when AVX-512 block2 was requested.
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

! With one kernel fixed at configure time, this loop must be compiled only if
! neither the AVX-512 block6 nor the AVX-512 block4 kernel is the selected one.
! The block4 macro was misspelled (WITH_RRAL_...), so that exclusion never took
! effect.
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
do j = ncols, 2, -2
  ! Copy the two Householder vectors for columns j and j-1 into the work array.
  w(:,1) = bcast_buffer(1:nbw,j+off)
  w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
  ! The OpenMP build indexes a by the additional my_thread dimension.
  call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
                                              w, nbw, nl, stripe_width, nbw)
#else
  call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,j+off+a_off-1,istripe)), &
                                              w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL)) */

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then
......@@ -454,9 +483,10 @@ module compute_hh_trafo_real
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
......@@ -499,7 +529,55 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
! AVX-512 kernel working on 4 Householder vectors per call (double precision),
! followed by a 2-vector pass and a 1-vector fallback for leftover columns.
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
! Kernel is chosen at run time: only enter when AVX-512 block4 was requested.
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
! Walk the columns from ncols downwards, four at a time.
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
! The OpenMP build indexes a by the additional my_thread dimension.
call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,j+off+a_off-3,istripe)), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
! Continue from the value j has after the loop above and process the
! remaining columns two at a time with the 2-vector AVX-512 kernel.
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe)), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
! jj == 1 after the pair loop means a single column is left over; it is
! handled by the single-vector CPU routine.
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) ) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......@@ -559,9 +637,10 @@ module compute_hh_trafo_real
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK6)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
......@@ -614,7 +693,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK6_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */
endif ! GPU_KERNEL
......@@ -682,7 +761,7 @@ module compute_hh_trafo_real
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
! Pull in the kernel interface declarations whenever any vectorized kernel
! family is available. The AVX-512 test uses HAVE_AVX512, the same macro name
! as the matching guard in the other hh_trafo routine of this module; the bare
! name AVX512 does not follow the HAVE_* pattern of the other macros tested here.
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -767,6 +846,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
......@@ -896,9 +977,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 ) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL) )
......@@ -919,7 +1001,32 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL || WITH_REAL_AVX2_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,j+off+a_off-1,istripe)), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL) ) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......@@ -1076,7 +1183,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......@@ -1195,7 +1302,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */
endif ! GPU_KERNEL
......
......@@ -381,6 +381,19 @@ program test_real2_choose_kernel_with_api_double_precision
#endif
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK6)
#else
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK4)
#else
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK2)
#endif
#endif
#endif
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
......@@ -419,6 +432,17 @@ program test_real2_choose_kernel_with_api_double_precision
REAL_ELPA_KERNEL_AVX2_BLOCK6)
#endif
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK2)
#endif
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK4)
#endif
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX512_BLOCK6)
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment