diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f0fe901dcf1eb429d491ac93c5e4cd2b2172f51..adb9d60c5c01f1e4f9a1f77ab2f6a1ec9c207b9f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -105,7 +105,7 @@ mpi-openmp-ftimings-redirect-real-generic-complex-generic-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -116,7 +116,7 @@ mpi-openmp-ftimings-redirect-real-generic-simple-complex-generic-simple-kernel-j - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 
16' @@ -127,7 +127,7 @@ mpi-openmp-ftimings-redirect-real-sse_assembly-complex-sse_assembly-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-generic-kernel-only --with-complex-generic-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -138,7 +138,7 @@ mpi-openmp-ftimings-redirect-real-sse_block2-complex-sse_block1-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block2-kernel-only --with-complex-sse_block1-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -149,7 +149,7 @@ mpi-openmp-ftimings-redirect-real-sse_block4-complex-sse_block2-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" 
FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block4-kernel-only --with-complex-sse_block2-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -160,7 +160,7 @@ mpi-openmp-ftimings-redirect-real-sse_block6-complex-avx_block1-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-sse_block6-kernel-only --with-complex-avx_block1-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -171,7 +171,7 @@ mpi-openmp-ftimings-redirect-real-avx_block2-complex-avx_block2-kernel-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core 
-lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block2-kernel-only --with-complex-avx_block2-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -182,7 +182,7 @@ mpi-openmp-ftimings-redirect-real-avx_block4-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block4-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' @@ -193,7 +193,7 @@ mpi-openmp-ftimings-redirect-real-avx_block6-jobs: - ./autogen.sh - ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread 
-lm -I$MKLROOT/include/intel64/lp64" --enable-openmp --with-ftimings --with-redirect --with-real-avx_block6-kernel-only - make -j 8 - - export OMP_NUJM_THREADS=2 + - export OMP_NUM_THREADS=2 - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - make check TEST_FLAGS='1500 50 16' diff --git a/Makefile.am b/Makefile.am index d9767263d94574a5d9b70d25ac3700b70752b7cf..738855b21c29711bbeccbd47fcd21cadfa0475b9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -256,80 +256,81 @@ elpa2_print_kernels@SUFFIX@_SOURCES = src/print_available_elpa2_kernels.F90 $(sh elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib) check_SCRIPTS = \ - elpa1_test_real.sh \ - elpa1_test_real_with_c.sh \ - elpa2_test_real.sh \ - elpa2_test_real_default_kernel.sh \ - elpa1_test_complex.sh \ - elpa2_test_complex.sh \ - elpa2_test_complex_default_kernel.sh \ - elpa2_test_real_default_kernel_qr_decomposition.sh \ - elpa2_test_real_choose_kernel_with_api.sh \ - elpa2_test_complex_choose_kernel_with_api.sh \ + elpa1_test_real@SUFFIX@.sh \ + elpa1_test_real_with_c@SUFFIX@.sh \ + elpa2_test_real@SUFFIX@.sh \ + elpa2_test_real_default_kernel@SUFFIX@.sh \ + elpa1_test_complex@SUFFIX@.sh \ + elpa2_test_complex@SUFFIX@.sh \ + elpa2_test_complex_default_kernel@SUFFIX@.sh \ + elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ + elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ + elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ elpa2_print_kernels@SUFFIX@ if !WITH_OPENMP check_SCRIPTS += \ - elpa1_test_real_c_version.sh \ - elpa1_test_complex_c_version.sh \ - elpa2_test_real_c_version.sh \ - elpa2_test_complex_c_version.sh + elpa1_test_real_c_version@SUFFIX@.sh \ + elpa1_test_complex_c_version@SUFFIX@.sh \ + elpa2_test_real_c_version@SUFFIX@.sh \ + elpa2_test_complex_c_version@SUFFIX@.sh endif + TESTS = $(check_SCRIPTS) -elpa1_test_real.sh: - echo 'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real.sh - chmod +x elpa1_test_real.sh +elpa1_test_real@SUFFIX@.sh: + echo 
'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real@SUFFIX@.sh + chmod +x elpa1_test_real@SUFFIX@.sh -elpa1_test_real_with_c.sh: - echo 'mpiexec -n 2 ./elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c.sh - chmod +x elpa1_test_real_with_c.sh +elpa1_test_real_with_c@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c@SUFFIX@.sh + chmod +x elpa1_test_real_with_c@SUFFIX@.sh -elpa2_test_real_c_version.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version.sh - chmod +x elpa2_test_real_c_version.sh +elpa2_test_real_c_version@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version@SUFFIX@.sh + chmod +x elpa2_test_real_c_version@SUFFIX@.sh -elpa2_test_complex_c_version.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version.sh - chmod +x elpa2_test_complex_c_version.sh +elpa2_test_complex_c_version@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version@SUFFIX@.sh + chmod +x elpa2_test_complex_c_version@SUFFIX@.sh -elpa1_test_real_c_version.sh: - echo 'mpiexec -n 2 ./elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version.sh - chmod +x elpa1_test_real_c_version.sh +elpa1_test_real_c_version@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version@SUFFIX@.sh + chmod +x elpa1_test_real_c_version@SUFFIX@.sh -elpa1_test_complex_c_version.sh: - echo 'mpiexec -n 2 ./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version.sh - chmod +x elpa1_test_complex_c_version.sh -elpa2_test_real.sh: - echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real.sh - chmod +x elpa2_test_real.sh +elpa1_test_complex_c_version@SUFFIX@.sh: + echo 'mpiexec -n 2 
./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version@SUFFIX@.sh + chmod +x elpa1_test_complex_c_version@SUFFIX@.sh +elpa2_test_real@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real@SUFFIX@.sh + chmod +x elpa2_test_real@SUFFIX@.sh -elpa2_test_real_default_kernel.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel.sh - chmod +x elpa2_test_real_default_kernel.sh +elpa2_test_real_default_kernel@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel@SUFFIX@.sh + chmod +x elpa2_test_real_default_kernel@SUFFIX@.sh -elpa2_test_real_default_kernel_qr_decomposition.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition.sh - chmod +x elpa2_test_real_default_kernel_qr_decomposition.sh +elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh + chmod +x elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh -elpa2_test_real_choose_kernel_with_api.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api.sh - chmod +x elpa2_test_real_choose_kernel_with_api.sh +elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh + chmod +x elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh -elpa1_test_complex.sh: - echo 'mpiexec -n 2 ./elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex.sh - chmod +x elpa1_test_complex.sh +elpa1_test_complex@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex@SUFFIX@.sh + chmod +x 
elpa1_test_complex@SUFFIX@.sh -elpa2_test_complex.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex.sh - chmod +x elpa2_test_complex.sh +elpa2_test_complex@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex@SUFFIX@.sh + chmod +x elpa2_test_complex@SUFFIX@.sh -elpa2_test_complex_default_kernel.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel.sh - chmod +x elpa2_test_complex_default_kernel.sh +elpa2_test_complex_default_kernel@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel@SUFFIX@.sh + chmod +x elpa2_test_complex_default_kernel@SUFFIX@.sh -elpa2_test_complex_choose_kernel_with_api.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh - chmod +x elpa2_test_complex_choose_kernel_with_api.sh +elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh: + echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh + chmod +x elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ @@ -346,24 +347,16 @@ elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ +mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 + $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@ + + include doxygen.am CLEANFILES = \ elpa-generated.h \ - elpa1_test_real.sh \ - elpa1_test_complex.sh \ - elpa2_test_real.sh 
\ - elpa2_test_real_default_kernel.sh \ - elpa2_test_real_default_kernel_qr_decomposition.sh \ - elpa2_test_complex.sh \ - elpa2_test_complex_default_kernel.sh \ - elpa2_test_real_choose_kernel_with_api.sh \ - elpa2_test_complex_choose_kernel_with_api.sh \ - elpa1_test_real_with_c.sh \ - elpa1_test_real_c_version.sh \ - elpa1_test_complex_c_version.sh \ - elpa2_test_real_c_version.sh \ - elpa2_test_complex_c_version.sh \ + elpa1_test* \ + elpa2_test*\ *.i clean-local: diff --git a/configure.ac b/configure.ac index 15b951f9ce56c069a9c817d3e0ff31a689b6e1cf..3e1f0236f31971ea0d8600a2990419bbd78332e0 100644 --- a/configure.ac +++ b/configure.ac @@ -800,10 +800,14 @@ fi if test x"${use_specific_complex_kernel}" = x"no" ; then AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) +else + AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)]) fi if test x"${use_specific_real_kernel}" = x"no" ; then AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) +else + AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)]) fi LT_INIT diff --git a/src/elpa2.F90 b/src/elpa2.F90 index 9989f7b9740e0d028ab214a5dcdf10804058b889..6a9f598e6b9eb56ff79ffb06fdf184175e117d46 100644 --- a/src/elpa2.F90 +++ b/src/elpa2.F90 @@ -214,7 +214,7 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & THIS_REAL_ELPA_KERNEL = get_actual_real_kernel() endif - ! check whether choosen kernel is allowed + ! check whether choosen kernel is allowed: function returns true if NOT allowed! 
change this if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then if (my_pe == 0) then @@ -230,10 +230,18 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & enddo write(error_unit,*) " " - write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !" + ! check whether generic kernel is defined + if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then + write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !" + else + write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used" + endif + endif ! my_pe == 0 + if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then + THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC + else + THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL endif - THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC - endif ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 @@ -433,9 +441,18 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & enddo write(error_unit,*) " " - write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" + ! check whether generic kernel is defined + if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then + write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" + else + write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used" + endif + endif ! my_pe == 0 + if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then + THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC + else + THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL endif - THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC endif ! 
Choose bandwidth, must be a multiple of nblk, set to a value >= 32 diff --git a/src/elpa2_utilities.F90 b/src/elpa2_utilities.F90 index 82bc873e4090c35d4c53e58a0ac42517c5f3d7f4..27d586077b71f93b7c6ebbeb25656a2599c4fbe6 100644 --- a/src/elpa2_utilities.F90 +++ b/src/elpa2_utilities.F90 @@ -76,14 +76,16 @@ module ELPA2_utilities REAL_ELPA_KERNEL_AVX_BLOCK2, & REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, & REAL_ELPA_KERNEL_AVX2_BLOCK2, & - REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6 + REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,& + DEFAULT_REAL_ELPA_KERNEL public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, & - COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 + COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, & + DEFAULT_COMPLEX_ELPA_KERNEL public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES @@ -115,10 +117,114 @@ module ELPA2_utilities integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6 #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) + +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#endif +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE +#endif +#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) + +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + integer, parameter :: 
DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 +#else + +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ + +#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 #else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ + +#ifdef WITH_REAL_BGP_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP +#endif +#ifdef WITH_REAL_BGQ_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#else /* WITH_REAL_AVX_BLOCK2_KERNEL */ + +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC #endif +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || 
defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 +#else +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ + +#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 +#else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 +#endif +#endif +#endif +#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ + +#ifdef WITH_REAL_BGP_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP +#endif +#ifdef WITH_REAL_BGQ_KERNEL + integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */ + character(35), parameter, dimension(number_of_real_kernels) :: & REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", & "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & @@ -149,10 +255,86 @@ module ELPA2_utilities integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) + +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = 
COMPLEX_ELPA_KERNEL_GENERIC +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +! go through all kernels and set them +#ifdef WITH_COMPLEX_GENERIC_KERNEL integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC +#endif +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ + +#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 #else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ + +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC + +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +! 
go through all kernels and set them +#ifdef WITH_COMPLEX_GENERIC_KERNEL integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC #endif +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE +#endif +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE +#endif + +#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ + +#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 +#else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 +#endif +#endif +#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ + character(35), parameter, dimension(number_of_complex_kernels) :: & COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 index 2949c4183ad18ca7458ccc70dd953dfa7986db1b..9f5f68a9322fea642fe1090d62ab7fe42b123c77 100644 --- a/src/mod_compute_hh_trafo_complex.F90 +++ b/src/mod_compute_hh_trafo_complex.F90 @@ -71,9 +71,9 @@ module compute_hh_trafo_complex #ifdef HAVE_DETAILED_TIMINGS #ifdef WITH_OPENMP - call 
timer%stop("compute_hh_trafo_complex_cpu_openmp") + call timer%start("compute_hh_trafo_complex_cpu_openmp") #else - call timer%stop("compute_hh_trafo_complex_cpu") + call timer%start("compute_hh_trafo_complex_cpu") #endif #endif @@ -250,6 +250,8 @@ module compute_hh_trafo_complex #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP @@ -260,16 +262,20 @@ module compute_hh_trafo_complex bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) endif #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNE */ +#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */ #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. & (THIS_COMPLEX_ELPA_KERNEL .eq. 
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP @@ -280,6 +286,8 @@ module compute_hh_trafo_complex bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) endif #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 index db6d003bb7fe77e914e182bfeeae6697ce4b490a..0bd211032780320e5dfef5c083f573d6d77c7b42 100644 --- a/src/mod_compute_hh_trafo_real.F90 +++ b/src/mod_compute_hh_trafo_real.F90 @@ -218,6 +218,8 @@ module compute_hh_trafo_real #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) do j = ncols, 2, -2 w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) @@ -229,6 +231,8 @@ module compute_hh_trafo_real w, nbw, nl, stripe_width, nbw) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -239,6 +243,8 @@ module compute_hh_trafo_real if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. 
REAL_ELPA_KERNEL_AVX2_BLOCK2)) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL)) do j = ncols, 2, -2 w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) @@ -250,6 +256,8 @@ module compute_hh_trafo_real w, nbw, nl, stripe_width, nbw) #endif enddo +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -322,6 +330,8 @@ module compute_hh_trafo_real #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS do j = ncols, 4, -4 w(:,1) = bcast_buffer(1:nbw,j+off) @@ -354,6 +364,9 @@ module compute_hh_trafo_real if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif + +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ @@ -364,6 +377,8 @@ module compute_hh_trafo_real if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) ! 
X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS do j = ncols, 4, -4 w(:,1) = bcast_buffer(1:nbw,j+off) @@ -396,6 +411,9 @@ module compute_hh_trafo_real if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif + +#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */ + #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) endif #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ diff --git a/test/fortran_test_programs/elpa_test_programs_print_headers.X90 b/test/fortran_test_programs/elpa_test_programs_print_headers.X90 index 2312b6124b9a6520979b4fcff26f71ade6082ef4..3f646a1d18fbd8ddbedf8151d191fd7e641bffd3 100644 --- a/test/fortran_test_programs/elpa_test_programs_print_headers.X90 +++ b/test/fortran_test_programs/elpa_test_programs_print_headers.X90 @@ -129,7 +129,7 @@ #ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL print *,"GENERIC SIMPLE kernel for real matrices" #endif -#ifdef WITH_REAL_SSE_KERNEL +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL print *,"SSE ASSEMBLER kernel for real matrices" #endif #ifdef WITH_REAL_BGP_KERNEL @@ -174,7 +174,7 @@ #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL print *,"GENERIC SIMPLE kernel for complex matrices" #endif -#ifdef WITH_COMPLEX_SSE_KERNEL +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL print *,"SSE ASSEMBLER kernel for complex matrices" #endif diff --git a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 index 22984d674105e22b68f49330e80243e16e408ab7..206c27b5f39a496756cc64af80a1ebd75262f6cd 100644 --- a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 +++ b/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 @@ -287,8 +287,61 @@ program test_complex2 success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & 
na_cols, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & +#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ +#ifdef WITH_COMPLEX_GENERIC_KERNEL + COMPLEX_ELPA_KERNEL_GENERIC) +#endif + +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) +#endif + +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + COMPLEX_ELPA_KERNEL_SSE) +#endif + +#ifdef WITH_ONE_SPECIFIC_COMPLEX_KERNEL + +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK2) +#else +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK1) +#endif +#endif + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK2) +#else +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK1) +#endif +#endif + +#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK1) +#endif + +#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_SSE_BLOCK2) +#endif + +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK1) +#endif + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + COMPLEX_ELPA_KERNEL_AVX_BLOCK2) +#endif + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ + +#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ if (.not.(success)) then write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." 
diff --git a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 index 8fa709bced33cdcfa2afdc73391fa6fe007003b8..589d9a7eaee2353495cea5d5a95a8dc7d79e0310 100644 --- a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 +++ b/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 @@ -278,9 +278,85 @@ program test_real2 success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & na_cols, & mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & +#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL + REAL_ELPA_KERNEL_GENERIC_SIMPLE) +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_GENERIC_KERNEL + REAL_ELPA_KERNEL_GENERIC) +#endif + +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL REAL_ELPA_KERNEL_GENERIC_SIMPLE) +#endif + +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + REAL_ELPA_KERNEL_SSE) +#endif +#ifdef WITH_ONE_SPECIFIC_REAL_KERNEL + +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK6) +#else +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK4) +#else +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK2) +#endif +#endif +#endif +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK6) +#else +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK4) +#else +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK2) +#endif +#endif +#endif + +#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef WITH_REAL_SSE_BLOCK2_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK2) +#endif + +#ifdef WITH_REAL_SSE_BLOCK4_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK4) +#endif + +#ifdef WITH_REAL_SSE_BLOCK6_KERNEL + REAL_ELPA_KERNEL_SSE_BLOCK6) +#endif + +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK2) +#endif + +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK4) +#endif + +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + REAL_ELPA_KERNEL_AVX_BLOCK6) +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ + +#ifdef
WITH_REAL_BGP_KERNEL + REAL_ELPA_KERNEL_BGP) +#endif + +#ifdef WITH_REAL_BGQ_KERNEL + REAL_ELPA_KERNEL_BGQ) +#endif + +#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - if (.not.(success)) then + if (.not.(success)) then write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." #ifdef WITH_MPI call MPI_ABORT(mpi_comm_world, 1, mpierr)