ELPA_GPU branch: The new SSE kernels were not considered when setting the default kernels

This fixes issue #16: because the default kernels were set inconsistently,
several kernels were called at the same time, which produced wrong
results.
parent 0c328350
......@@ -882,10 +882,14 @@ fi
if test x"${use_specific_complex_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi
LT_INIT
......
......@@ -260,7 +260,7 @@ contains
THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
endif
! check whether choosen kernel is allowed
! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
if (my_pe == 0) then
......@@ -276,10 +276,18 @@ contains
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
else
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
endif
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
......@@ -588,7 +596,6 @@ contains
endif
endif
if (present(THIS_REAL_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
......@@ -615,10 +622,18 @@ contains
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
else
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
endif
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
......
......@@ -79,7 +79,7 @@ module ELPA2_utilities
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6, &
REAL_ELPA_KERNEL_GPU
REAL_ELPA_KERNEL_GPU, DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
......@@ -88,7 +88,7 @@ module ELPA2_utilities
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
COMPLEX_ELPA_KERNEL_GPU
COMPLEX_ELPA_KERNEL_GPU, DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -135,15 +135,36 @@ module ELPA2_utilities
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
......@@ -171,15 +192,35 @@ module ELPA2_utilities
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
......@@ -241,12 +282,25 @@ module ELPA2_utilities
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
......@@ -270,12 +324,25 @@ module ELPA2_utilities
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#endif
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
......
......@@ -150,6 +150,7 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -303,6 +304,8 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_SSE_BLOCK2))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -313,6 +316,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_SSE_BLOCK2)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......@@ -323,6 +328,9 @@ module compute_hh_trafo_complex
if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_AVX_BLOCK2) && !defined(COMPLEX_ELPA_KERNEL_AVX2_BLOCK2))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -333,6 +341,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_AVX_BLOCK2) && !defined(COMPLEX_ELPA_KERNEL_AVX2_BLOCK2)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......
......@@ -299,6 +299,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -310,6 +312,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -321,6 +325,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -333,6 +339,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -403,6 +411,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -436,6 +446,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -446,6 +459,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -479,6 +494,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......
......@@ -235,7 +235,7 @@ program test_complex2_choose_kernel_with_api_double_precision
print *, " The settings are: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE"
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
print *, " The settings are: COMPLEX_ELPA_KERNEL_SSE"
#endif
......@@ -345,10 +345,38 @@ program test_complex2_choose_kernel_with_api_double_precision
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
COMPLEX_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#endif
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
......@@ -357,6 +385,8 @@ program test_complex2_choose_kernel_with_api_double_precision
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_GPU_VERSION
COMPLEX_ELPA_KERNEL_GPU)
#endif
......
......@@ -235,7 +235,7 @@ program test_complex2_choose_kernel_with_api_single_precision
print *, " The settings are: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE"
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
print *, " The settings are: COMPLEX_ELPA_KERNEL_SSE"
#endif
......@@ -345,10 +345,39 @@ program test_complex2_choose_kernel_with_api_single_precision
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
COMPLEX_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#endif
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
......@@ -357,6 +386,8 @@ program test_complex2_choose_kernel_with_api_single_precision
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_GPU_VERSION
COMPLEX_ELPA_KERNEL_GPU)
#endif
......
......@@ -231,7 +231,7 @@ program test_real2_choose_kernel_with_api_double_precision
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
print *, " The settings are: REAL_ELPA_KERNEL_GENERIC_SIMPLE"
#endif
#ifdef WITH_REAL_GENERIC_SSE_KERNEL
#ifdef WITH_REAL_GENERIC_SSE_ASSEMBLY_KERNEL
print *, " The settings are: REAL_ELPA_KERNEL_SSE"
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
......@@ -340,9 +340,47 @@ program test_real2_choose_kernel_with_api_double_precision
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_REAL_SSE_KERNEL
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
REAL_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#endif
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK6)
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK4)
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif
#endif
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#endif
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
......@@ -356,6 +394,9 @@ program test_real2_choose_kernel_with_api_double_precision
REAL_ELPA_KERNEL_AVX_BLOCK6)
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_BGP_KERNEL
REAL_ELPA_KERNEL_BGP)
#endif
......
......@@ -231,7 +231,7 @@ program test_real2_choose_kernel_with_api_single_precision
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
print *, " The settings are: REAL_ELPA_KERNEL_GENERIC_SIMPLE"
#endif
#ifdef WITH_REAL_GENERIC_SSE_KERNEL
#ifdef WITH_REAL_GENERIC_SSE_ASSEMBLY_KERNEL
print *, " The settings are: REAL_ELPA_KERNEL_SSE"
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
......@@ -340,9 +340,47 @@ program test_real2_choose_kernel_with_api_single_precision
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_REAL_SSE_KERNEL
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
REAL_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#endif
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK6)
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK4)
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif
#endif
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#endif
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
......@@ -356,6 +394,9 @@ program test_real2_choose_kernel_with_api_single_precision
REAL_ELPA_KERNEL_AVX_BLOCK6)
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_BGP_KERNEL
REAL_ELPA_KERNEL_BGP)
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment