Commit 6ede2442 authored by Andreas Marek's avatar Andreas Marek

Error in setting specific kernel at compile time

When a single SSE/AVX/AVX2 kernel was selected at compile time, more than
one kernel could end up being called, because some kernels depend on other
kernels (e.g. block 6 depends on block 4 and block 2).
parent efe005da
...@@ -800,10 +800,14 @@ fi ...@@ -800,10 +800,14 @@ fi
if test x"${use_specific_complex_kernel}" = x"no" ; then if test x"${use_specific_complex_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi fi
if test x"${use_specific_real_kernel}" = x"no" ; then if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi fi
LT_INIT LT_INIT
......
...@@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_REAL_ELPA_KERNEL = get_actual_real_kernel() THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
endif endif
! check whether choosen kernel is allowed ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
if (my_pe == 0) then if (my_pe == 0) then
...@@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo enddo
write(error_unit,*) " " write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !" ! check whether generic kernel is defined
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
else
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif endif
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
endif endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
...@@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo enddo
write(error_unit,*) " " write(error_unit,*) " "
write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" ! check whether generic kernel is defined
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
else
THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL
endif endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......
...@@ -76,14 +76,16 @@ module ELPA2_utilities ...@@ -76,14 +76,16 @@ module ELPA2_utilities
REAL_ELPA_KERNEL_AVX_BLOCK2, & REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, & REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, & REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6 REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,&
DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, & COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
...@@ -115,10 +117,114 @@ module ELPA2_utilities ...@@ -115,10 +117,114 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6 integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) #if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else #else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
! Blue Gene kernels: select the BGP or BGQ kernel as compile-time default.
! The kernel identifiers carry no AVX infix (the complex counterparts are
! COMPLEX_ELPA_KERNEL_BGP and COMPLEX_ELPA_KERNEL_BGQ).
! NOTE(review): assumes REAL_ELPA_KERNEL_BGP / REAL_ELPA_KERNEL_BGQ are the
! names declared with the other REAL_ELPA_KERNEL_* parameters - confirm.
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else /* WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif #endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
! Blue Gene kernels: select the BGP or BGQ kernel as compile-time default.
! The kernel identifiers carry no AVX infix (the complex counterparts are
! COMPLEX_ELPA_KERNEL_BGP and COMPLEX_ELPA_KERNEL_BGQ).
! NOTE(review): assumes REAL_ELPA_KERNEL_BGP / REAL_ELPA_KERNEL_BGQ are the
! names declared with the other REAL_ELPA_KERNEL_* parameters - confirm.
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
character(35), parameter, dimension(number_of_real_kernels) :: & character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", & REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", &
...@@ -149,10 +255,86 @@ module ELPA2_utilities ...@@ -149,10 +255,86 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
! SSE block kernels: if BLOCK2 is compiled in it becomes the default,
! otherwise BLOCK1 is used when it is compiled in.
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else #else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif #endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
! SSE block kernels: if BLOCK2 is compiled in it becomes the default,
! otherwise BLOCK1 is used when it is compiled in.
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
character(35), parameter, dimension(number_of_complex_kernels) :: & character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
......
...@@ -250,6 +250,8 @@ module compute_hh_trafo_complex ...@@ -250,6 +250,8 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
! Compile the BLOCK1 loop only when the kernel is chosen at run time, or when
! the single compile-time kernel is not the SSE BLOCK2 kernel. The guard has
! to test the WITH_* configure macro: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 is a
! Fortran integer parameter, never a preprocessor symbol, so testing it with
! defined() is always false and the loop would be compiled in unconditionally.
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 1, -1 do j = ncols, 1, -1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -260,6 +262,8 @@ module compute_hh_trafo_complex ...@@ -260,6 +262,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width) bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
...@@ -270,6 +274,8 @@ module compute_hh_trafo_complex ...@@ -270,6 +274,8 @@ module compute_hh_trafo_complex
if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. & if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
! Compile the BLOCK1 loop only when the kernel is chosen at run time, or when
! the single compile-time kernel is neither the AVX nor the AVX2 BLOCK2 kernel.
! The guard has to test the WITH_* configure macros: the
! COMPLEX_ELPA_KERNEL_* names are Fortran integer parameters, never
! preprocessor symbols, so testing them with defined() is always false.
! NOTE(review): WITH_COMPLEX_AVX2_BLOCK2_KERNEL follows the naming of the
! WITH_REAL_AVX2_* macros used for the real kernels - confirm it is the
! macro emitted by configure.
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 1, -1 do j = ncols, 1, -1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -280,6 +286,8 @@ module compute_hh_trafo_complex ...@@ -280,6 +286,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width) bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......
...@@ -218,6 +218,8 @@ module compute_hh_trafo_real ...@@ -218,6 +218,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -229,6 +231,8 @@ module compute_hh_trafo_real ...@@ -229,6 +231,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -239,6 +243,8 @@ module compute_hh_trafo_real ...@@ -239,6 +243,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -250,6 +256,8 @@ module compute_hh_trafo_real ...@@ -250,6 +256,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -322,6 +330,8 @@ module compute_hh_trafo_real ...@@ -322,6 +330,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
...@@ -354,6 +364,9 @@ module compute_hh_trafo_real ...@@ -354,6 +364,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif #endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -364,6 +377,8 @@ module compute_hh_trafo_real ...@@ -364,6 +377,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
...@@ -396,6 +411,9 @@ module compute_hh_trafo_real ...@@ -396,6 +411,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif #endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......
...@@ -129,7 +129,7 @@ ...@@ -129,7 +129,7 @@
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL #ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
print *,"GENERIC SIMPLE kernel for real matrices" print *,"GENERIC SIMPLE kernel for real matrices"
#endif #endif
#ifdef WITH_REAL_SSE_KERNEL #ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
print *,"SSE ASSEMBLER kernel for real matrices" print *,"SSE ASSEMBLER kernel for real matrices"
#endif #endif
#ifdef WITH_REAL_BGP_KERNEL #ifdef WITH_REAL_BGP_KERNEL
...@@ -174,7 +174,7 @@ ...@@ -174,7 +174,7 @@
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
print *,"GENERIC SIMPLE kernel for complex matrices" print *,"GENERIC SIMPLE kernel for complex matrices"
#endif #endif
#ifdef WITH_COMPLEX_SSE_KERNEL #ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices" print *,"SSE ASSEMBLER kernel for complex matrices"
#endif #endif
......
...@@ -287,8 +287,61 @@ program test_complex2 ...@@ -287,8 +287,61 @@ program test_complex2
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
na_cols, & na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_GENERIC_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC)
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
COMPLEX_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#endif
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
if (.not.(success)) then if (.not.(success)) then
write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..."
......
...@@ -278,9 +278,85 @@ program test_real2 ...@@ -278,9 +278,85 @@ program test_real2
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
na_cols, & na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
REAL_ELPA_KERNEL_GENERIC)
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
REAL_ELPA_KERNEL_GENERIC_SIMPLE) REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
REAL_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL