diff --git a/configure.ac b/configure.ac index 15b951f9ce56c069a9c817d3e0ff31a689b6e1cf..3e1f0236f31971ea0d8600a2990419bbd78332e0 100644 --- a/configure.ac +++ b/configure.ac @@ -800,10 +800,14 @@ fi if test x"${use_specific_complex_kernel}" = x"no" ; then AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) +else + AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)]) fi if test x"${use_specific_real_kernel}" = x"no" ; then AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) +else + AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)]) fi LT_INIT diff --git a/src/elpa2.F90 b/src/elpa2.F90 index 9989f7b9740e0d028ab214a5dcdf10804058b889..6a9f598e6b9eb56ff79ffb06fdf184175e117d46 100644 --- a/src/elpa2.F90 +++ b/src/elpa2.F90 @@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & THIS_REAL_ELPA_KERNEL = get_actual_real_kernel() endif - ! check whether choosen kernel is allowed + ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then if (my_pe == 0) then @@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & enddo write(error_unit,*) " " - write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !" + ! check whether generic kernel is defined + if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then + write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !" + else + write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used" + endif + endif ! my_pe == 0 + if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 
1) then + THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC + else + THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL endif - THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC - endif ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 @@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & enddo write(error_unit,*) " " - write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" + ! check whether generic kernel is defined + if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then + write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" + else + write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used" + endif + endif ! my_pe == 0 + if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then + THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC + else + THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL endif - THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC endif ! 
Choose bandwidth, must be a multiple of nblk, set to a value >= 32 diff --git a/src/elpa2_utilities.F90 b/src/elpa2_utilities.F90 index 82bc873e4090c35d4c53e58a0ac42517c5f3d7f4..27d586077b71f93b7c6ebbeb25656a2599c4fbe6 100644 --- a/src/elpa2_utilities.F90 +++ b/src/elpa2_utilities.F90 @@ -76,14 +76,16 @@ module ELPA2_utilities REAL_ELPA_KERNEL_AVX_BLOCK2, & REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, & REAL_ELPA_KERNEL_AVX2_BLOCK2, & - REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6 + REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,& + DEFAULT_REAL_ELPA_KERNEL public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, & - COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 + COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, & + DEFAULT_COMPLEX_ELPA_KERNEL public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES @@ -115,10 +117,114 @@ module ELPA2_utilities integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6 #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) + +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#endif +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE +#endif +#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) + +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + integer, parameter :: 
DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 +#else + +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ + +#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 #else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ + +#ifdef WITH_REAL_BGP_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP +#endif +#ifdef WITH_REAL_BGQ_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#else /* WITH_REAL_AVX_BLOCK2_KERNEL */ + +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC #endif +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || 
defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 +#else +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ + +#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 +#else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ + +#ifdef WITH_REAL_BGP_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP +#endif +#ifdef WITH_REAL_BGQ_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */ + character(35), parameter, dimension(number_of_real_kernels) :: & REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", & "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & @@ -149,10 +255,86 @@ module ELPA2_utilities integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) + +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = 
COMPLEX_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +! go through all kernels and set them +#ifdef WITH_COMPLEX_GENERIC_KERNEL integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC +#endif +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ + +#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 #else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ + +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC + +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +! 
go through all kernels and set them +#ifdef WITH_COMPLEX_GENERIC_KERNEL integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC #endif +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ + +#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 +#else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ + character(35), parameter, dimension(number_of_complex_kernels) :: & COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 index 2949c4183ad18ca7458ccc70dd953dfa7986db1b..481299be42428f79381050e100b0168a09712988 100644 --- a/src/mod_compute_hh_trafo_complex.F90 +++ b/src/mod_compute_hh_trafo_complex.F90 @@ -250,6 +250,8 @@ module compute_hh_trafo_complex #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) if 
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP @@ -260,6 +262,8 @@ module compute_hh_trafo_complex bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) endif #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ @@ -270,6 +274,8 @@ module compute_hh_trafo_complex if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. & (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP @@ -280,6 +286,8 @@ module compute_hh_trafo_complex bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) endif #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 index db6d003bb7fe77e914e182bfeeae6697ce4b490a..0bd211032780320e5dfef5c083f573d6d77c7b42 100644 --- a/src/mod_compute_hh_trafo_real.F90 +++ b/src/mod_compute_hh_trafo_real.F90 @@ -218,6 +218,8 @@ module compute_hh_trafo_real #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) if (THIS_REAL_ELPA_KERNEL .eq. 
REAL_ELPA_KERNEL_SSE_BLOCK2) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) do j = ncols, 2, -2 w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) @@ -229,6 +231,8 @@ module compute_hh_trafo_real w, nbw, nl, stripe_width, nbw) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -239,6 +243,8 @@ module compute_hh_trafo_real if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL)) do j = ncols, 2, -2 w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) @@ -250,6 +256,8 @@ module compute_hh_trafo_real w, nbw, nl, stripe_width, nbw) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -322,6 +330,8 @@ module compute_hh_trafo_real #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) ! 
X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS do j = ncols, 4, -4 w(:,1) = bcast_buffer(1:nbw,j+off) @@ -354,6 +364,9 @@ module compute_hh_trafo_real if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif + +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -364,6 +377,8 @@ module compute_hh_trafo_real if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS do j = ncols, 4, -4 w(:,1) = bcast_buffer(1:nbw,j+off) @@ -396,6 +411,9 @@ module compute_hh_trafo_real if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif + +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ diff --git a/test/fortran_test_programs/elpa_test_programs_print_headers.X90 b/test/fortran_test_programs/elpa_test_programs_print_headers.X90 index 2312b6124b9a6520979b4fcff26f71ade6082ef4..3f646a1d18fbd8ddbedf8151d191fd7e641bffd3 100644 --- a/test/fortran_test_programs/elpa_test_programs_print_headers.X90 +++ b/test/fortran_test_programs/elpa_test_programs_print_headers.X90 @@ -129,7 +129,7 @@ #ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL print *,"GENERIC SIMPLE kernel for real 
matrices" #endif -#ifdef WITH_REAL_SSE_KERNEL +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL print *,"SSE ASSEMBLER kernel for real matrices" #endif #ifdef WITH_REAL_BGP_KERNEL @@ -174,7 +174,7 @@ #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL print *,"GENERIC SIMPLE kernel for complex matrices" #endif -#ifdef WITH_COMPLEX_SSE_KERNEL +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL print *,"SSE ASSEMBLER kernel for complex matrices" #endif diff --git a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 index 22984d674105e22b68f49330e80243e16e408ab7..206c27b5f39a496756cc64af80a1ebd75262f6cd 100644 --- a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 +++ b/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 @@ -287,8 +287,61 @@ program test_complex2 success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & na_cols, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ +#ifdef WITH_COMPLEX_GENERIC_KERNEL + COMPLEX_ELPA_KERNEL_GENERIC) +#endif + +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) +#endif + +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + COMPLEX_ELPA_KERNEL_SSE) +#endif + +#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK2) +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK1) +#endif +#endif + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK2) +#else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK1) +#endif +#endif + +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK1) +#endif + +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK2) +#endif + +#ifdef 
WITH_COMPLEX_AVX_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK1) +#endif + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK2) +#endif + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ if (.not.(success)) then write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." diff --git a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 index 8fa709bced33cdcfa2afdc73391fa6fe007003b8..589d9a7eaee2353495cea5d5a95a8dc7d79e0310 100644 --- a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 +++ b/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 @@ -278,9 +278,85 @@ program test_real2 success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & na_cols, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + REAL_ELPA_KERNEL_GENERIC_SIMPLE) +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL + REAL_ELPA_KERNEL_GENERIC) +#endif + +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL REAL_ELPA_KERNEL_GENERIC_SIMPLE) +#endif + +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + REAL_ELPA_KERNEL_SSE) +#endif +#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL + +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK6) +#else +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK4) +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK2) +#endif +#endif +#endif +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK6) +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK4) +#else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK2) +#endif +#endif +#endif + +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK2) +#endif + +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK4) +#endif + +#ifdef 
WITH_REAL_SSE_BLOCK6_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK6) +#endif + +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK2) +#endif + +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK4) +#endif + +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK6) +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_BGP_KERNEL + REAL_ELPA_KERNEL_BGP) +#endif + +#ifdef WITH_REAL_BGQ_KERNEL + REAL_ELPA_KERNEL_BGQ) +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - if (.not.(success)) then + if (.not.(success)) then write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." #ifdef WITH_MPI call MPI_ABORT(mpi_comm_world, 1, mpierr)