Commit 6ede2442 authored by Andreas Marek's avatar Andreas Marek

Error in setting specific kernel at compile time

In case of SSE/AVX/AVX2 it could happen that more than one kernel
(since some depend on other kernels, e.g. block 6 on block 4 and
block 2) were called
parent efe005da
......@@ -800,10 +800,14 @@ fi
if test x"${use_specific_complex_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi
LT_INIT
......
......@@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
endif
! check whether choosen kernel is allowed
! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
if (my_pe == 0) then
......@@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
else
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......@@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
else
THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......
......@@ -76,14 +76,16 @@ module ELPA2_utilities
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,&
DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -115,10 +117,114 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else /* WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GENERIC_SIMPLE ", &
......@@ -149,10 +255,86 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
......
......@@ -250,6 +250,8 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_SSE_BLOCK2))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -260,6 +262,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_SSE_BLOCK2)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......@@ -270,6 +274,8 @@ module compute_hh_trafo_complex
if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_AVX_BLOCK2) && !defined(COMPLEX_ELPA_KERNEL_AVX2_BLOCK2))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -280,6 +286,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(COMPLEX_ELPA_KERNEL_AVX_BLOCK2) && !defined(COMPLEX_ELPA_KERNEL_AVX2_BLOCK2)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......
......@@ -218,6 +218,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -229,6 +231,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -239,6 +243,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -250,6 +256,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -322,6 +330,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -354,6 +364,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -364,6 +377,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -396,6 +411,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......
......@@ -129,7 +129,7 @@
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
print *,"GENERIC SIMPLE kernel for real matrices"
#endif
#ifdef WITH_REAL_SSE_KERNEL
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
print *,"SSE ASSEMBLER kernel for real matrices"
#endif
#ifdef WITH_REAL_BGP_KERNEL
......@@ -174,7 +174,7 @@
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
print *,"GENERIC SIMPLE kernel for complex matrices"
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices"
#endif
......
......@@ -287,8 +287,61 @@ program test_complex2
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_GENERIC_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC)
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
COMPLEX_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#endif
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK1)
#endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
COMPLEX_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
if (.not.(success)) then
write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..."
......
......@@ -278,9 +278,85 @@ program test_real2
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
REAL_ELPA_KERNEL_GENERIC)
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
REAL_ELPA_KERNEL_SSE)
#endif
#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#endif
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK6)
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK4)
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
#endif
#endif
#endif
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK2)
#endif
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK4)
#endif
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
REAL_ELPA_KERNEL_SSE_BLOCK6)
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK2)
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK4)
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
REAL_ELPA_KERNEL_AVX_BLOCK6)
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_BGP_KERNEL
REAL_ELPA_KERNEL_BGP)
#endif
#ifdef WITH_REAL_BGQ_KERNEL
REAL_ELPA_KERNEL_BGQ)
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
if (.not.(success)) then
if (.not.(success)) then
write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..."
#ifdef WITH_MPI
call MPI_ABORT(mpi_comm_world, 1, mpierr)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment