Commit aebf900d authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master' into ELPA_KNL

parents 005b7687 48760118
......@@ -127,6 +127,38 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
# - gpu
# script:
# - module unload gcc
# - module load gcc/4.9 cuda
# - module list
# - ./autogen.sh
# - ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - export ELPA_USE_GPU=yes
# - export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
# - export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
#intel-single-precision-mpi-noomp-cuda-blocksize-jobs:
# tags:
# - gpu
# script:
# - module unload gcc
# - module load gcc/4.9 cuda
# - module list
# - ./autogen.sh
# - ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - export ELPA_USE_GPU=yes
# - export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
# - export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-single-precision-nompi-noomp-cuda-jobs:
tags:
- gpu
......@@ -2196,29 +2228,27 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
#real avx2 block2, complex avx2 block1 (emulated)
# todo: (pkus) I commented out the emulated tests for the process of rebase
# todo: they should be enabled again
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block1-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block1-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-special-gcov-jobs:
## tags:
......@@ -2232,17 +2262,17 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
## - /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block2, complex avx2 block1 (emulated)
#real avx2 block2, complex avx2 block1 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block2-kernel-only --with-complex-avx2-block1-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
## tags:
......@@ -2256,29 +2286,29 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block4, complex avx2 block2 (emulated)
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#real avx2 block4, complex avx2 block2 (emulated)
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-special-gcov-jobs:
## tags:
......@@ -2293,17 +2323,17 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
## # - ./test_scripts/get_coverage_summary.sh
##real avx2 block4, complex avx2 block2 (emulated)
#real avx2 block4, complex avx2 block2 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block4-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block4-complex-avx2_block2-kernel-jobs:
## tags:
......@@ -2317,29 +2347,29 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
### - ./test_scripts/get_coverage_summary.sh
###real avx2 block6, complex avx2 block2
#intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
#gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export OMP_NUM_THREADS=2
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##real avx2 block6, complex avx2 block2
intel-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-special-gcov-jobs:
## tags:
......@@ -2354,17 +2384,17 @@ intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx_block6-complex-av
### - ./test_scripts/get_coverage_summary.sh
##real avx2 block6, complex avx2 block2 (emulated)
#real avx2 block6, complex avx2 block2 (emulated)
#intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
#tags:
#- emulated
#script:
#- ./autogen.sh
#- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
#- make -j 8
#- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
#- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure CFLAGS="-march=haswell -mavx2 -mfma" FCFLAGS="-O3 -march=core-avx2" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP" --with-ftimings --with-redirect --with-real-avx2-block6-kernel-only --with-complex-avx2-block2-kernel-only --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -hsw -- make check TEST_FLAGS='100 25 16'
##gfortran-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block6-complex-avx2_block2-kernel-jobs:
## tags:
......
......@@ -32,6 +32,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_determine_workload.F90 \
src/elpa2_compute.F90 \
src/elpa2_kernels/mod_fortran_interfaces.F90 \
src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
......@@ -48,10 +49,19 @@ libelpa@SUFFIX@_private_la_SOURCES = \
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/elpa1_compute_complex_template.X90 \
src/elpa1_compute_real_template.X90 \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
......@@ -78,7 +88,7 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
else
libelpa@SUFFIX@_private_la_SOURCES += \
src/timer_dummy.F90
src/timer_dummy.F90
endif
if WITH_GPU_VERSION
......@@ -924,18 +934,24 @@ EXTRA_DIST = \
test/Fortran/elpa_print_headers.X90 \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/elpa1_compute_real_template.X90 \
src/elpa1_compute_template.X90 \
src/elpa1_merge_systems_real_template.X90 \
src/elpa1_solve_tridi_real_template.X90 \
src/elpa1_tools_real_template.X90 \
src/elpa1_trans_ev_real_template.X90 \
src/elpa1_tridiag_real_template.X90 \
src/elpa1_compute_complex_template.X90 \
src/elpa1_tools_complex_template.X90 \
src/elpa1_trans_ev_complex_template.X90 \
src/elpa1_tridiag_complex_template.X90 \
src/elpa1_tools_template.X90 \
src/elpa1_trans_ev_template.X90 \
src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/precision_macros.h \
src/precision_macros_complex.h \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
......
......@@ -12,7 +12,7 @@
# license that conforms to the Open Source Definition (Version 1.9)
# published by the Open Source Initiative.
%define so_version 4
%define so_version 8
# OpenMP support requires an MPI implementation with MPI_THREAD_MULTIPLE support,
# which is only available for a sufficiently configured openmpi >= 1.8
......
#!/usr/bin/python
import sys
# Templates for routine-name macros.  print_variant() and print_undefs()
# substitute the placeholder NUMBER with the number type they are called
# with ("real" / "complex"); print_variant() additionally substitutes
# PRECISION with "single" / "double" in the macro value.
# The order of the entries determines the order of the generated lines.
simple_tokens = [
    "elpa_transpose_vectors_NUMBER_PRECISION",
    "elpa_reduce_add_vectors_NUMBER_PRECISION",
    "bandred_NUMBER_PRECISION",
    "trans_ev_band_to_full_NUMBER_PRECISION",
    "tridiag_band_NUMBER_PRECISION",
    "trans_ev_tridi_to_band_NUMBER_PRECISION",
    "band_band_NUMBER_PRECISION",
    "tridiag_NUMBER_PRECISION",
    "trans_ev_NUMBER_PRECISION",
    "solve_tridi_PRECISION",
    "solve_tridi_col_PRECISION",
    "solve_tridi_single_problem_PRECISION",
    "qr_pdgeqrf_2dcomm_PRECISION",
    "hh_transform_NUMBER_PRECISION",
    "symm_matrix_allreduce_PRECISION",
    "redist_band_NUMBER_PRECISION",
    "unpack_row_NUMBER_cpu_PRECISION",
    "unpack_row_NUMBER_cpu_openmp_PRECISION",
    "unpack_and_prepare_row_group_NUMBER_gpu_PRECISION",
    "extract_hh_tau_NUMBER_gpu_PRECISION",
    "compute_hh_dot_products_NUMBER_gpu_PRECISION",
    "compute_hh_trafo_NUMBER_cpu_openmp_PRECISION",
    "compute_hh_trafo_NUMBER_cpu_PRECISION",
    "pack_row_group_NUMBER_gpu_PRECISION",
    "pack_row_NUMBER_cpu_openmp_PRECISION",
    "pack_row_NUMBER_cpu_PRECISION",
    "wy_gen_PRECISION",
    "wy_right_PRECISION",
    "wy_left_PRECISION",
    "wy_symm_PRECISION",
    "merge_recursive_PRECISION",
    "merge_systems_PRECISION",
    "distribute_global_column_PRECISION",
    "check_monotony_PRECISION",
    "global_gather_PRECISION",
    "resort_ev_PRECISION",
    "transform_columns_PRECISION",
    "solve_secular_equation_PRECISION",
    "global_product_PRECISION",
    "add_tmp_PRECISION",
    "v_add_s_PRECISION",
]
# Macro names whose "PRECISION_" part print_variant() replaces with the
# one-letter prefix looked up in blas_prefixes (S, D, C or Z).
blas_tokens = [
    "PRECISION_GEMV",
    "PRECISION_TRMV",
    "PRECISION_GEMM",
    "PRECISION_TRMM",
    "PRECISION_HERK",
    "PRECISION_SYRK",
    "PRECISION_SYMV",
    "PRECISION_SYMM",
    "PRECISION_SYR2",
    "PRECISION_SYR2K",
    "PRECISION_GEQRF",
    "PRECISION_STEDC",
    "PRECISION_STEQR",
    "PRECISION_LAMRG",
    "PRECISION_LAMCH",
    "PRECISION_LAPY2",
    "PRECISION_LAED4",
    "PRECISION_LAED5",
    "cublas_PRECISION_GEMM",
    "cublas_PRECISION_TRMM",
    "cublas_PRECISION_GEMV",
]
# Explicit macro table for the complex case.
# Each entry is (macro name, double-precision value, single-precision value);
# print_variant() picks the value column through explicit_order.
explicit_tokens_complex = [
    ("PRECISION_SUFFIX", '"_double"', '"_single"'),
    ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
    ("KIND_PRECISION", "rk8", "rk4"),
    ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
    ("PRECISION_IMAG", "DIMAG", "AIMAG"),
    ("PRECISION_REAL", "DREAL", "REAL"),
    ("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
    ("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
    ("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
    (
        "size_of_PRECISION_complex",
        "size_of_double_complex_datatype",
        "size_of_single_complex_datatype",
    ),
]
# Explicit macro table for the real case.
# Each entry is (macro name, double-precision value, single-precision value);
# print_variant() picks the value column through explicit_order.
explicit_tokens_real = [
    ("PRECISION_SUFFIX", '"_double"', '"_single"'),
    ("CONST_0_0", "0.0_rk8", "0.0_rk4"),
    ("CONST_0_5", "0.5_rk8", "0.5_rk4"),
    ("CONST_1_0", "1.0_rk8", "1.0_rk4"),
    ("CONST_2_0", "2.0_rk8", "2.0_rk4"),
    ("CONST_8_0", "8.0_rk8", "8.0_rk4"),
    (
        "size_of_PRECISION_real",
        "size_of_double_real_datatype",
        "size_of_single_real_datatype",
    ),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
]
# Index of the value column inside an explicit-token tuple, per precision
# (index 0 of each tuple is the macro name).
explicit_order = {
    "single": 2,
    "double": 1,
}

# One-letter routine prefix for each (number type, precision) combination;
# substituted for "PRECISION_" in the entries of blas_tokens.
blas_prefixes = {
    ("real", "single"): "S",
    ("real", "double"): "D",
    ("complex", "single"): "C",
    ("complex", "double"): "Z",
}
def print_variant(number, precision, explicit):
    """Write the ``#define`` lines for one number type / precision pair.

    number    -- string substituted for the NUMBER placeholder in the entries
                 of simple_tokens; together with ``precision`` it is the key
                 into blas_prefixes.
    precision -- string substituted for the PRECISION placeholder; also the
                 key into explicit_order.
    explicit  -- sequence of (macro, double value, single value) tuples.

    Output goes to standard output, one macro per line.  The single-argument
    ``print(...)`` form is used so the script runs unchanged under Python 2
    and Python 3; the two blanks after ``#define`` reproduce the output of
    the former ``print "#define ", ...`` statement byte for byte.
    """
    for token in simple_tokens:
        macro = token.replace("NUMBER", number)
        value = token.replace("PRECISION", precision).replace("NUMBER", number)
        print("#define  %s %s" % (macro, value))
    for token in blas_tokens:
        # Only the "PRECISION_" part is replaced, so a leading "cublas_" stays.
        routine = token.replace("PRECISION_", blas_prefixes[(number, precision)])
        print("#define  %s %s" % (token, routine))
    for token in explicit:
        print("#define  %s %s" % (token[0], token[explicit_order[precision]]))
def print_undefs(number, explicit):
    """Write an ``#undef`` line for every macro that print_variant() defines.

    number   -- string substituted for the NUMBER placeholder in the entries
                of simple_tokens.
    explicit -- sequence of tuples whose first element is the macro name.

    Output goes to standard output, one macro per line.  The single-argument
    ``print(...)`` form is used so the script runs unchanged under Python 2
    and Python 3; the two blanks after ``#undef`` reproduce the output of
    the former ``print "#undef ", ...`` statement byte for byte.
    """
    for token in simple_tokens:
        print("#undef  %s" % token.replace("NUMBER", number))
    for token in blas_tokens:
        print("#undef  %s" % token)
    for token in explicit:
        print("#undef  %s" % token[0])
# Command-line entry point: the first argument selects which header variant
# is written to standard output ("complex" or "real").
#
# A missing or unknown argument ends the script with a usage message and a
# non-zero exit status.  This replaces the previous assert(False), which is
# stripped under "python -O" and then let the script succeed with no output.
if len(sys.argv) < 2 or sys.argv[1] not in ("complex", "real"):
    sys.exit("usage: %s {real|complex}" % sys.argv[0])

if sys.argv[1] == "complex":
    print("#ifdef DOUBLE_PRECISION_COMPLEX")
    print_undefs("complex", explicit_tokens_complex)
    print_variant("complex", "double", explicit_tokens_complex)
    print("#else")
    print_undefs("complex", explicit_tokens_complex)
    print_variant("complex", "single", explicit_tokens_complex)
    print("#endif")
else:
    print("#ifdef DOUBLE_PRECISION_REAL")
    print_undefs("real", explicit_tokens_real)
    print_variant("real", "double", explicit_tokens_real)
    print("#else")
    print_undefs("real", explicit_tokens_real)
    print_variant("real", "single", explicit_tokens_real)
    print("#endif")
\ No newline at end of file
#!/usr/bin/python
# Routine-name macros: the emission code below replaces PRECISION with
# "double" or "single" to build the macro value.
simple_tokens = [
    "tridiag_complex_PRECISION",
    "trans_ev_complex_PRECISION",
    "solve_complex_PRECISION",
    "hh_transform_complex_PRECISION",
    "elpa_transpose_vectors_complex_PRECISION",
    "elpa_reduce_add_vectors_complex_PRECISION",
]

# Macros whose "PRECISION_" part is replaced with "Z" (double branch) or
# "C" (single branch) by the emission code below.
blas_tokens = [
    "PRECISION_GEMV",
    "PRECISION_TRMV",
    "PRECISION_GEMM",
    "PRECISION_TRMM",
    "PRECISION_HERK",
    "cublas_PRECISION_gemm",
    "cublas_PRECISION_trmm",
    "cublas_PRECISION_gemv",
]

# (macro name, value for the DOUBLE_PRECISION_COMPLEX branch,
#  value for the #else branch)
explicit_tokens = [
    ("PRECISION_SUFFIX", '"_double"', '"_single"'),
    ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
    ("KIND_PRECISION", "rk8", "rk4"),
    ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
    ("PRECISION_IMAG", "DIMAG", "AIMAG"),
    ("PRECISION_REAL", "DREAL", "REAL"),
    ("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
    ("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
    (
        "size_of_PRECISION_complex",
        "size_of_double_complex_datatype",
        "size_of_single_complex_datatype",
    ),
]
# Write the generated header to standard output.
#
# The single-argument print(...) form runs unchanged under Python 2 and
# Python 3; the two blanks after "#define" / "#undef" reproduce the output
# of the former 'print "#define ", ...' statements byte for byte.
#
# NOTE(review): only the #else branch emits #undef lines before its
# #define lines; the DOUBLE_PRECISION_COMPLEX branch defines the macros
# directly.  Kept as is so the generated text does not change.
print("#ifdef DOUBLE_PRECISION_COMPLEX")
for token in simple_tokens:
    print("#define  %s %s" % (token, token.replace("PRECISION", "double")))
for token in blas_tokens:
    print("#define  %s %s" % (token, token.replace("PRECISION_", "Z")))
for token in explicit_tokens:
    print("#define  %s %s" % (token[0], token[1]))
print("#else")
for token in simple_tokens:
    print("#undef  %s" % token)
for token in blas_tokens:
    print("#undef  %s" % token)
for token in explicit_tokens:
    print("#undef  %s" % token[0])
for token in simple_tokens:
    print("#define  %s %s" % (token, token.replace("PRECISION", "single")))
for token in blas_tokens:
    print("#define  %s %s" % (token, token.replace("PRECISION_", "C")))
for token in explicit_tokens:
    print("#define  %s %s" % (token[0], token[2]))
print("#endif")
This diff is collapsed.
......@@ -212,10 +212,14 @@ module ELPA1_COMPUTE
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#include "elpa1_compute_real_template.X90"
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
......@@ -223,7 +227,12 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define REAL_DATATYPE rk4
#include "elpa1_compute_real_template.X90"
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef REALCASE
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
......@@ -234,7 +243,12 @@ module ELPA1_COMPUTE
#define DOUBLE_PRECISION_COMPLEX 1
#define REAL_DATATYPE rk8
#define COMPLEX_DATATYPE ck8
#include "elpa1_compute_complex_template.X90"
#define COMPLEXCASE 1
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
......@@ -248,7 +262,12 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4
#define COMPLEX_DATATYPE ck4
#include "elpa1_compute_complex_template.X90"
#define COMPLEXCASE 1
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
......
......@@ -52,16 +52,39 @@
! distributed along with the original code in the file "COPYING".
#endif
#if REALCASE == 1
#include "precision_macros.h"