Commit f59e2758 authored by Pavel Kus's avatar Pavel Kus
Browse files

elpa1_template split

some intent (in) or (out) added
some comments added

Conflicts:
	src/elpa1.F90
	src/elpa1_compute_real_template.X90
parent 1605e66c
......@@ -2132,184 +2132,186 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
#real avx2 block2, complex avx2 block1 (emulated)
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block1-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-special-gcov-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
# - make -j 8
# - export OMP_NUM_THREADS=1
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
# # - ./test_scripts/get_coverage_summary.sh
#real avx2 block2, complex avx2 block1 (emulated)
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
# # - ./test_scripts/get_coverage_summary.sh
#real avx2 block4, complex avx2 block2 (emulated)
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-special-gcov-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
# - make -j 8
# - export OMP_NUM_THREADS=1
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
# # - ./test_scripts/get_coverage_summary.sh
#real avx2 block4, complex avx2 block2 (emulated)
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## - ./test_scripts/get_coverage_summary.sh
##real avx2 block6, complex avx2 block2
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-special-gcov-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
# - make -j 8
# - export OMP_NUM_THREADS=1
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## - ./test_scripts/get_coverage_summary.sh
#real avx2 block6, complex avx2 block2 (emulated)
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
# tags:
# - emulated
# script:
# - ./autogen.sh
# - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## - ./test_scripts/get_coverage_summary.sh
# TODO (pkus): the emulated tests below are commented out for the duration of the rebase
# TODO: they should be re-enabled once the rebase is finished
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block1-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-special-gcov-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
## - make -j 8
## - export OMP_NUM_THREADS=1
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block2, complex avx2 block1 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
## - make -j 8
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block4, complex avx2 block2 (emulated)
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-special-gcov-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
## - make -j 8
## - export OMP_NUM_THREADS=1
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block4, complex avx2 block2 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
## - make -j 8
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
### - ./test_scripts/get_coverage_summary.sh
###real avx2 block6, complex avx2 block2
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-special-gcov-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
## - make -j 8
## - export OMP_NUM_THREADS=1
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
### - ./test_scripts/get_coverage_summary.sh
##real avx2 block6, complex avx2 block2 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
## tags:
## - emulated
## script:
## - ./autogen.sh
## - ./configure FC=mpif90 CFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
## - make -j 8
## - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
### - ./test_scripts/get_coverage_summary.sh
# testing of kernel setting via environment
......
......@@ -844,6 +844,11 @@ EXTRA_DIST = \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/elpa1_compute_real_template.X90 \
src/elpa1_merge_systems_real_template.X90 \
src/elpa1_solve_tridi_real_template.X90 \
src/elpa1_tools_real_template.X90 \
src/elpa1_trans_ev_real_template.X90 \
src/elpa1_tridiag_real_template.X90 \
src/elpa1_compute_complex_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
......
......@@ -215,7 +215,6 @@ module ELPA1_COMPUTE
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#include "precision_macros.h"
#include "elpa1_compute_real_template.X90"
#undef DOUBLE_PRECISION_REAL
......@@ -227,7 +226,6 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define REAL_DATATYPE rk4
#include "precision_macros.h"
#include "elpa1_compute_real_template.X90"
#undef DOUBLE_PRECISION_REAL
......
......@@ -52,2879 +52,11 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_tridiag_real_PRECISSION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form
! (like Scalapack Routine PDSYTRD)
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
! matrixCols local columns of matrix
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! d(na) Diagonal elements (returned), identical on all processors
!
! e(na) Off-Diagonal elements (returned), identical on all processors
!
! tau(na) Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=REAL_DATATYPE) :: d(na), e(na), tau(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols)
#endif
integer(kind=ik), parameter :: max_stored_rows = 32
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, nstor
integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter
integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real(kind=REAL_DATATYPE) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
real(kind=REAL_DATATYPE), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("tridiag_real" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = (totalblocks-1)/np_cols + 1
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop
endif
allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vr "//errorMessage
stop
endif
allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating ur "//errorMessage
stop
endif
allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vc "//errorMessage
stop
endif
allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uc "//errorMessage
stop
endif
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating ur_p "//errorMessage
stop
endif
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uc_p "//errorMessage
stop
endif
#endif
tmp = 0
vr = 0
ur = 0
vc = 0
uc = 0
allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vur "//errorMessage
stop
endif
allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uvc "//errorMessage
stop
endif
! if (useGPU) then
! allocate(vr_dev(max_local_rows))
! allocate(ur_dev(max_local_rows))
! allocate(vc_dev(max_local_cols))
! allocate(uc_dev(max_local_cols))
! allocate(vur_dev(max_local_rows,2*max_stored_rows))
! allocate(uvc_dev(max_local_cols,2*max_stored_rows))
! endif
d(:) = 0
e(:) = 0
tau(:) = 0
nstor = 0
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols)
! if (useGPU) then
! allocate(a_dev(lda,na))
! a_dev = a
! endif
do istep=na,3,-1
! Calculate number of local rows and columns of the still remaining matrix
! on the local processor
l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
! Calculate vector for Householder transformation on all procs
! owning column istep
if(my_pcol==pcol(istep, nblk, np_cols)) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
! if (useGPU) then
! vr(1:l_rows) = a_dev(1:l_rows,l_cols+1)
! else
vr(1:l_rows) = a(1:l_rows,l_cols+1)
! endif
if(nstor>0 .and. l_rows>0) then
call M_PRECISSION_GEMV('N', l_rows, 2*nstor, M_CONST_1_0, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), M_CONST_1_0, vr, 1)
endif
if(my_prow==prow(istep-1, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1))
aux1(2) = vr(l_rows)
else
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(aux1, aux2, 2, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call M_hh_transform_real_PRECISSION(vrl, vnorm2, xf, tau(istep))
! Scale vr and store Householder vector for back transformation
vr(1:l_rows) = vr(1:l_rows) * xf
if(my_prow==prow(istep-1, nblk, np_rows)) then
vr(l_rows) = 1.
e(istep-1) = vrl