Commit 5afd4d63 authored by Andreas Marek

Merge branch 'master' into ELPA_KNL

parents a6235503 2fb06c49
@@ -58,7 +58,8 @@ intel-double-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --disable-assumed-size
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-double-precision-nompi-noomp-cuda-jobs:
@@ -72,7 +73,8 @@ intel-double-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
#gfortran-double-precision-mpi-noomp-jobs:
# tags:
@@ -125,7 +127,22 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-mpi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
@@ -170,7 +187,22 @@ intel-single-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-nompi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
#gfortran-single-precision-mpi-noomp-jobs:
@@ -213,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP" --with-mpi=no FC=ifort
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 50 32' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-assumed-size-jobs:
tags:
- cpu
@@ -376,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 500 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-assumed-size-jobs:
tags:
- cpu
......
@@ -26,6 +26,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_redist_band.F90 \
src/mod_pack_unpack_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
@@ -63,6 +64,11 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
@@ -347,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_complex2.F90 \
test/Fortran/test_complex2_default.F90 \
test/Fortran/test_complex2_api.F90 \
test/Fortran/test_complex2_banded.F90 \
test/Fortran/test_complex.F90 \
test/Fortran/test_real2.F90 \
test/Fortran/test_real2_default.F90 \
@@ -388,6 +395,7 @@ noinst_PROGRAMS = \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_banded@SUFFIX@ \
elpa_driver_real@SUFFIX@ \
elpa_driver_complex@SUFFIX@ \
elpa1_real_toeplitz@SUFFIX@ \
@@ -553,6 +561,7 @@ EXTRA_elpa1_complex_invert_trm@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_he
elpa2_test_real@SUFFIX@_SOURCES = test/Fortran/test_real2.F90
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
@@ -596,6 +605,12 @@ elpa2_test_complex_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_complex_banded@SUFFIX@_SOURCES = test/Fortran/test_complex2_banded.F90
elpa2_test_complex_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa_driver_real@SUFFIX@_SOURCES = test/Fortran/test_driver_real.F90
elpa_driver_real@SUFFIX@_LDADD = $(build_lib)
elpa_driver_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
@@ -777,6 +792,7 @@ check_SCRIPTS = \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa2_test_complex_banded@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
elpa1_real_toeplitz@SUFFIX@.sh \
@@ -913,7 +929,7 @@ CLEANFILES = \
elpa2_test*\
elpa2_real* \
elpa1_real* \
elpa*.sh \
*.i
clean-local:
@@ -958,6 +974,11 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
......
@@ -113,6 +113,7 @@ if test x"$with_mpi" = x"yes"; then
AC_MSG_ERROR([Could not compile an MPI Fortran program])
fi
fi
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
@@ -250,7 +251,7 @@ if test x"${want_single_precision}" = x"yes" ; then
fi
dnl check whether one can compile with sse-gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
@@ -655,7 +656,7 @@ if test x"${have_mkl}" = x"yes" ; then
else
dnl first check blas
AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no])
AC_MSG_CHECKING([whether we can link a program with a blas lib])
AC_MSG_RESULT([${have_blas}])
@@ -756,6 +757,23 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
dnl check whether BAND_TO_FULL_BLOCKING is set
use_band_to_full_blocking=yes
AC_MSG_CHECKING(whether BAND_TO_FULL_BLOCKING is requested)
AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--enable-band-to-full-blocking],
[build ELPA2 with blocking in band_to_full (default: enabled)])],
want_band_to_full_blocking="yes", want_band_to_full_blocking="no")
AC_MSG_RESULT([${want_band_to_full_blocking}])
if test x"${enableval}" = x"no" ; then
use_band_to_full_blocking=no
fi
AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$use_band_to_full_blocking" = x"yes"])
if test x"${use_band_to_full_blocking}" = x"yes"; then
AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full])
fi
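dnl Usage sketch (not part of the change itself): the switch defined above can be
dnl turned off at configure time, e.g. together with the flags already used in the
dnl CI jobs further up:
dnl   ./configure FC=ifort \
dnl       SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP" \
dnl       SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP" \
dnl       --with-mpi=0 --disable-band-to-full-blocking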
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
@@ -775,9 +793,28 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
dnl setup nvcc flags and use them in later tests
user_sets_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=value],
[use compute capability "value" for GPU version (default sm_35)])],
[user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
dnl sanity check whether compute capability setting by user is reasonable
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
dnl the user must set a value which starts with "sm_"
value=$(echo $withval | cut -c1-3)
if test x"${value}" = x"sm_" ; then
cuda_compute_capability=$withval
else
AC_MSG_ERROR([Unknown GPU compute capability set: ${withval}])
fi
fi
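dnl Usage sketch (not part of the change itself): the default stays sm_35; a newer
dnl card can be targeted by passing a value with the required "sm_" prefix, e.g.
dnl   ./configure --enable-gpu-support --with-cuda-path=$CUDA_HOME/ \
dnl       --with-GPU-compute-capability=sm_60 ...
dnl Any value that does not start with "sm_" is rejected by the check above.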
if test x"${want_gpu}" = x"yes" ; then if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C]) AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -O2 -I$CUDA_INSTALL_PATH/include" CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64" LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS" NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc" NVCC="nvcc"
...@@ -1245,6 +1282,20 @@ AC_CONFIG_FILES([ ...@@ -1245,6 +1282,20 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in ${PKG_CONFIG_FILE}:elpa.pc.in
]) ])
AC_MSG_CHECKING([if workaround for Intel's broken preprocessor is needed])
if test x"$FC" = x"mpiifort" ; then
need_manual_cpp=yes
fi
if test x"$FC" = x"ifort" ; then
need_manual_cpp=yes
fi
if test x"$need_manual_cpp" = x"yes" ; then
AC_MSG_RESULT([yes])
FC="\$(top_srcdir)/manual_cpp $FC"
else
AC_MSG_RESULT([no])
fi
AC_OUTPUT
if test "${can_compile_avx}" = "no" ; then
......
.TH "elpa_solve_evp_complex_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*- .TH "elpa_solve_evp_complex_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l .ad l
.nh .nh
.SH NAME .SH NAME
...@@ -12,7 +12,7 @@ use elpa1 ...@@ -12,7 +12,7 @@ use elpa1
use elpa2 use elpa2
.br .br
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)" .RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure" .RI "logical \fBsuccess\fP: return value indicating success or failure"
.br .br
.SS C INTERFACE .SS C INTERFACE
...@@ -55,7 +57,7 @@ use elpa2 ...@@ -55,7 +57,7 @@ use elpa2
#include <complex.h> #include <complex.h>
.br .br
.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);" .RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br .br
.RI " " .RI " "
.br .br
...@@ -89,6 +91,9 @@ use elpa2 ...@@ -89,6 +91,9 @@ use elpa2
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver" .RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br .br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION .SH DESCRIPTION
......
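A minimal sketch (not from this commit) of how a caller might use the extended C prototype documented above. The wrapper name is illustrative only; all MPI setup, 2D process-grid creation, and local matrix distribution are assumed to be done by the caller, and the argument order simply follows the prototype:

    #include <complex.h>
    #include "elpa.h"

    /* Illustrative wrapper around the call documented above: pass the bandwidth
     * of an already banded matrix, or -1 if the matrix is not banded. */
    int solve_banded_complex(int na, int nev, double complex *a, int lda,
                             double *ev, double complex *q, int ldq,
                             int nblk, int matrixCols,
                             int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all,
                             int THIS_ELPA_COMPLEX_KERNEL, int useGPU, int bandwidth)
    {
      int success = elpa_solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq,
                                                         nblk, matrixCols,
                                                         mpi_comm_rows, mpi_comm_cols,
                                                         mpi_comm_all,
                                                         THIS_ELPA_COMPLEX_KERNEL,
                                                         useGPU, bandwidth);
      return success;  /* 1 indicates success, 0 failure (see above) */
    }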
.TH "elpa_solve_evp_complex_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*- .TH "elpa_solve_evp_complex_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l .ad l
.nh .nh
.SH NAME .SH NAME
...@@ -12,7 +12,7 @@ use elpa1 ...@@ -12,7 +12,7 @@ use elpa1
use elpa2 use elpa2
.br .br
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)" .RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure" .RI "logical \fBsuccess\fP: return value indicating success or failure"
.br .br
.SS C INTERFACE .SS C INTERFACE
...@@ -55,7 +57,7 @@ use elpa2 ...@@ -55,7 +57,7 @@ use elpa2
#include <complex.h> #include <complex.h>
.br .br
.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);" .RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br .br
.RI " " .RI " "
.br .br
...@@ -89,6 +91,9 @@ use elpa2 ...@@ -89,6 +91,9 @@ use elpa2
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver" .RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br .br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION .SH DESCRIPTION
......
.TH "elpa_solve_evp_real_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*- .TH "elpa_solve_evp_real_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l .ad l
.nh .nh
.SH NAME .SH NAME
...@@ -12,7 +12,7 @@ use elpa1 ...@@ -12,7 +12,7 @@ use elpa1
use elpa2 use elpa2
.br .br
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)" .RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br .br
.RI " " .RI " "
.br .br
...@@ -49,13 +49,15 @@ use elpa2 ...@@ -49,13 +49,15 @@ use elpa2
.br .br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br .br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure" .RI "logical \fBsuccess\fP: return value indicating success or failure"
.br .br
.SS C INTERFACE .SS C INTERFACE
#include "elpa.h" #include "elpa.h"
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);" .RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br .br
.RI " " .RI " "
.br .br
...@@ -92,6 +94,8 @@ use elpa2 ...@@ -92,6 +94,8 @@ use elpa2
.br .br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br .br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION .SH DESCRIPTION
......
.TH "elpa_solve_evp_real_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*- .TH "elpa_solve_evp_real_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l .ad l
.nh .nh
.SH NAME .SH NAME
...@@ -12,7 +12,7 @@ use elpa1 ...@@ -12,7 +12,7 @@ use elpa1
use elpa2 use elpa2
.br .br
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)" .RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br .br
.RI " " .RI " "
.br .br
...@@ -49,13 +49,15 @@ use elpa2 ...@@ -49,13 +49,15 @@ use elpa2
.br .br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br .br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix "
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure" .RI "logical \fBsuccess\fP: return value indicating success or failure"
.br .br
.SS C INTERFACE .SS C INTERFACE
#include "elpa.h" #include "elpa.h"
.br .br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);" .RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br .br
.RI " " .RI " "
.br .br
...@@ -92,6 +94,8 @@ use elpa2 ...@@ -92,6 +94,8 @@ use elpa2
.br .br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not" .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br .br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION .SH DESCRIPTION
......