Commit d4c65f38 authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master' of gitlab.mpcdf.mpg.de:elpa/elpa

parents cd033b34 281e2be8
......@@ -105,7 +105,7 @@ mpi-openmp-ftimings-redirect-real-generic-complex-generic-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -116,7 +116,7 @@ mpi-openmp-ftimings-redirect-real-generic-simple-complex-generic-simple-kernel-j
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -127,7 +127,7 @@ mpi-openmp-ftimings-redirect-real-sse_assembly-complex-sse_assembly-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -138,7 +138,7 @@ mpi-openmp-ftimings-redirect-real-sse_block2-complex-sse_block1-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block2-kernel-only --with-complex-sse_block1-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -149,7 +149,7 @@ mpi-openmp-ftimings-redirect-real-sse_block4-complex-sse_block2-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block4-kernel-only --with-complex-sse_block2-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -160,7 +160,7 @@ mpi-openmp-ftimings-redirect-real-sse_block6-complex-avx_block1-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block6-kernel-only --with-complex-avx_block1-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -171,7 +171,7 @@ mpi-openmp-ftimings-redirect-real-avx_block2-complex-avx_block2-kernel-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block2-kernel-only --with-complex-avx_block2-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -182,7 +182,7 @@ mpi-openmp-ftimings-redirect-real-avx_block4-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block4-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......@@ -193,7 +193,7 @@ mpi-openmp-ftimings-redirect-real-avx_block6-jobs:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block6-kernel-only
- make -j 8
- export OMP_NUJM_THREADS=2
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 50 16'
......
......@@ -256,80 +256,81 @@ elpa2_print_kernels@SUFFIX@_SOURCES = src/print_available_elpa2_kernels.F90 $(sh
elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib)
check_SCRIPTS = \
elpa1_test_real.sh \
elpa1_test_real_with_c.sh \
elpa2_test_real.sh \
elpa2_test_real_default_kernel.sh \
elpa1_test_complex.sh \
elpa2_test_complex.sh \
elpa2_test_complex_default_kernel.sh \
elpa2_test_real_default_kernel_qr_decomposition.sh \
elpa2_test_real_choose_kernel_with_api.sh \
elpa2_test_complex_choose_kernel_with_api.sh \
elpa1_test_real@SUFFIX@.sh \
elpa1_test_real_with_c@SUFFIX@.sh \
elpa2_test_real@SUFFIX@.sh \
elpa2_test_real_default_kernel@SUFFIX@.sh \
elpa1_test_complex@SUFFIX@.sh \
elpa2_test_complex@SUFFIX@.sh \
elpa2_test_complex_default_kernel@SUFFIX@.sh \
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \
elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \
elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \
elpa2_print_kernels@SUFFIX@
if !WITH_OPENMP
check_SCRIPTS += \
elpa1_test_real_c_version.sh \
elpa1_test_complex_c_version.sh \
elpa2_test_real_c_version.sh \
elpa2_test_complex_c_version.sh
elpa1_test_real_c_version@SUFFIX@.sh \
elpa1_test_complex_c_version@SUFFIX@.sh \
elpa2_test_real_c_version@SUFFIX@.sh \
elpa2_test_complex_c_version@SUFFIX@.sh
endif
TESTS = $(check_SCRIPTS)
elpa1_test_real.sh:
echo 'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real.sh
chmod +x elpa1_test_real.sh
elpa1_test_real@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real@SUFFIX@.sh
chmod +x elpa1_test_real@SUFFIX@.sh
elpa1_test_real_with_c.sh:
echo 'mpiexec -n 2 ./elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c.sh
chmod +x elpa1_test_real_with_c.sh
elpa1_test_real_with_c@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c@SUFFIX@.sh
chmod +x elpa1_test_real_with_c@SUFFIX@.sh
elpa2_test_real_c_version.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version.sh
chmod +x elpa2_test_real_c_version.sh
elpa2_test_real_c_version@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version@SUFFIX@.sh
chmod +x elpa2_test_real_c_version@SUFFIX@.sh
elpa2_test_complex_c_version.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version.sh
chmod +x elpa2_test_complex_c_version.sh
elpa2_test_complex_c_version@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version@SUFFIX@.sh
chmod +x elpa2_test_complex_c_version@SUFFIX@.sh
elpa1_test_real_c_version.sh:
echo 'mpiexec -n 2 ./elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version.sh
chmod +x elpa1_test_real_c_version.sh
elpa1_test_real_c_version@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version@SUFFIX@.sh
chmod +x elpa1_test_real_c_version@SUFFIX@.sh
elpa1_test_complex_c_version.sh:
echo 'mpiexec -n 2 ./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version.sh
chmod +x elpa1_test_complex_c_version.sh
elpa2_test_real.sh:
echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real.sh
chmod +x elpa2_test_real.sh
elpa1_test_complex_c_version@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version@SUFFIX@.sh
chmod +x elpa1_test_complex_c_version@SUFFIX@.sh
elpa2_test_real@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real@SUFFIX@.sh
chmod +x elpa2_test_real@SUFFIX@.sh
elpa2_test_real_default_kernel.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel.sh
chmod +x elpa2_test_real_default_kernel.sh
elpa2_test_real_default_kernel@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel@SUFFIX@.sh
chmod +x elpa2_test_real_default_kernel@SUFFIX@.sh
elpa2_test_real_default_kernel_qr_decomposition.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition.sh
chmod +x elpa2_test_real_default_kernel_qr_decomposition.sh
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh
chmod +x elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh
elpa2_test_real_choose_kernel_with_api.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api.sh
chmod +x elpa2_test_real_choose_kernel_with_api.sh
elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh
chmod +x elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh
elpa1_test_complex.sh:
echo 'mpiexec -n 2 ./elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex.sh
chmod +x elpa1_test_complex.sh
elpa1_test_complex@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex@SUFFIX@.sh
chmod +x elpa1_test_complex@SUFFIX@.sh
elpa2_test_complex.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex.sh
chmod +x elpa2_test_complex.sh
elpa2_test_complex@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex@SUFFIX@.sh
chmod +x elpa2_test_complex@SUFFIX@.sh
elpa2_test_complex_default_kernel.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel.sh
chmod +x elpa2_test_complex_default_kernel.sh
elpa2_test_complex_default_kernel@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel@SUFFIX@.sh
chmod +x elpa2_test_complex_default_kernel@SUFFIX@.sh
elpa2_test_complex_choose_kernel_with_api.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
chmod +x elpa2_test_complex_choose_kernel_with_api.sh
elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh
chmod +x elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh
elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@
......@@ -346,24 +347,16 @@ elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@
include doxygen.am
CLEANFILES = \
elpa-generated.h \
elpa1_test_real.sh \
elpa1_test_complex.sh \
elpa2_test_real.sh \
elpa2_test_real_default_kernel.sh \
elpa2_test_real_default_kernel_qr_decomposition.sh \
elpa2_test_complex.sh \
elpa2_test_complex_default_kernel.sh \
elpa2_test_real_choose_kernel_with_api.sh \
elpa2_test_complex_choose_kernel_with_api.sh \
elpa1_test_real_with_c.sh \
elpa1_test_real_c_version.sh \
elpa1_test_complex_c_version.sh \
elpa2_test_real_c_version.sh \
elpa2_test_complex_c_version.sh \
elpa1_test* \
elpa2_test*\
*.i
clean-local:
......
......@@ -800,10 +800,14 @@ fi
if test x"${use_specific_complex_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
else
AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi
LT_INIT
......
......@@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
endif
! check whether choosen kernel is allowed
! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
if (my_pe == 0) then
......@@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
else
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......@@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
! check whether generic kernel is defined
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
else
THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......
......@@ -76,14 +76,16 @@ module ELPA2_utilities
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,&
DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -115,10 +117,114 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else /* WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GENERIC_SIMPLE ", &
......@@ -149,10 +255,86 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
......
......@@ -71,9 +71,9 @@ module compute_hh_trafo_complex
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call timer%stop("compute_hh_trafo_complex_cpu_openmp")
call timer%start("compute_hh_trafo_complex_cpu_openmp")
#else
call timer%stop("compute_hh_trafo_complex_cpu")
call timer%start("compute_hh_trafo_complex_cpu")
#endif
#endif
......@@ -250,6 +250,8 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -260,16 +262,20 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNE */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
......@@ -280,6 +286,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......
......@@ -218,6 +218,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -229,6 +231,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -239,6 +243,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -250,6 +256,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -322,6 +330,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -354,6 +364,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -364,6 +377,8 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -396,6 +411,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */