Commit 6a8f926a authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master' into elpa_interface

parents 50d0ec4b c4f2443e
This diff is collapsed.
......@@ -28,10 +28,9 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_pack_unpack_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/mod_redist_band.F90 \
src/mod_pack_unpack_cpu.F90 \
src/mod_compute_hh_trafo.F90 \
src/aligned_mem.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_determine_workload.F90 \
......@@ -42,7 +41,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/check_for_gpu.F90 \
src/mod_cuda.F90 \
src/interface_c_kernel.F90 \
src/mod_pack_unpack_real_gpu.F90 \
src/mod_pack_unpack_gpu.F90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.F90 \
src/elpa_qr/elpa_pdlarfb.F90 \
......@@ -64,7 +63,15 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -161,6 +168,14 @@ endif
endif
endif
# Real AVX512 kernel, 2hv sources: the double-precision file is appended to the
# private library whenever WITH_REAL_AVX512_BLOCK2_KERNEL is true; the
# single-precision file is appended only if WANT_SINGLE_PRECISION_REAL is true as well.
if WITH_REAL_AVX512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_2hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -182,6 +197,15 @@ endif
endif
endif
# Real AVX512 kernel, 4hv sources: the double-precision file is appended to the
# private library whenever WITH_REAL_AVX512_BLOCK4_KERNEL is true; the
# single-precision file is appended only if WANT_SINGLE_PRECISION_REAL is true as well.
if WITH_REAL_AVX512_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -203,6 +227,14 @@ endif
endif
endif
# Real AVX512 kernel, 6hv sources: the double-precision file is appended to the
# private library whenever WITH_REAL_AVX512_BLOCK6_KERNEL is true; the
# single-precision file is appended only if WANT_SINGLE_PRECISION_REAL is true as well.
if WITH_REAL_AVX512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -224,6 +256,14 @@ endif
endif
endif
# Complex AVX512 kernel, 1hv sources: the double-precision file is appended to the
# private library whenever WITH_COMPLEX_AVX512_BLOCK1_KERNEL is true; the
# single-precision file is appended only if WANT_SINGLE_PRECISION_COMPLEX is true as well.
if WITH_COMPLEX_AVX512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx512_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -245,6 +285,13 @@ endif
endif
endif
# Complex AVX512 kernel, 2hv sources: the double-precision file is appended to the
# private library whenever WITH_COMPLEX_AVX512_BLOCK2_KERNEL is true; the
# single-precision file is appended only if WANT_SINGLE_PRECISION_COMPLEX is true as well.
if WITH_COMPLEX_AVX512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx512_2hv_single_precision.c
endif
endif
.cu.lo:
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@
......@@ -281,6 +328,7 @@ dist_man_MANS = \
man/elpa_solve_evp_real_single.3 \
man/elpa_solve_evp_complex_double.3 \
man/elpa_solve_evp_complex_single.3 \
man/elpa_tests.1 \
man/elpa2_print_kernels.1
if WANT_SINGLE_PRECISION_REAL
......@@ -309,6 +357,7 @@ dist_files_DATA = \
test/Fortran/test_complex2.F90 \
test/Fortran/test_complex2_default.F90 \
test/Fortran/test_complex2_api.F90 \
test/Fortran/test_complex2_banded.F90 \
test/Fortran/test_complex.F90 \
test/Fortran/test_real2.F90 \
test/Fortran/test_real2_default.F90 \
......@@ -325,6 +374,7 @@ dist_files_DATA = \
test/Fortran/test_cholesky_complex.F90 \
test/Fortran/test_invert_trm_complex.F90 \
test/Fortran/test_new_interface.F90 \
test/Fortran/elpa_tests.F90 \
src/elpa2_print_kernels.F90
#end needed
......@@ -337,6 +387,7 @@ pkgconfig_DATA = @PKG_CONFIG_FILE@
# programs
bin_PROGRAMS = \
elpa_tests@SUFFIX@ \
elpa2_print_kernels@SUFFIX@
noinst_PROGRAMS = \
......@@ -351,6 +402,7 @@ noinst_PROGRAMS = \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_banded@SUFFIX@ \
elpa_driver_real@SUFFIX@ \
elpa_driver_complex@SUFFIX@ \
elpa1_real_toeplitz@SUFFIX@ \
......@@ -522,6 +574,7 @@ EXTRA_elpa1_complex_invert_trm@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_he
elpa2_test_real@SUFFIX@_SOURCES = test/Fortran/test_real2.F90
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
......@@ -565,6 +618,12 @@ elpa2_test_complex_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
# Test program elpa2_test_complex_banded: built from test_complex2_banded.F90 and
# linked against $(build_lib). The @FC_MODOUT@ and @FC_MODINC@ substitutions are
# both pointed at private_modules. elpa_print_headers.X90 is declared as an extra
# dependency (presumably because the test source includes it; confirm).
elpa2_test_complex_banded@SUFFIX@_SOURCES = test/Fortran/test_complex2_banded.F90
elpa2_test_complex_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa_driver_real@SUFFIX@_SOURCES = test/Fortran/test_driver_real.F90
elpa_driver_real@SUFFIX@_LDADD = $(build_lib)
elpa_driver_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
......@@ -579,6 +638,10 @@ elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90
elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib)
elpa2_print_kernels@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
# Program elpa_tests: built from test/Fortran/elpa_tests.F90 and linked against
# $(build_lib). The @FC_MODOUT@ and @FC_MODINC@ substitutions are both pointed at
# private_modules.
elpa_tests@SUFFIX@_SOURCES = test/Fortran/elpa_tests.F90
elpa_tests@SUFFIX@_LDADD = $(build_lib)
elpa_tests@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
if WANT_SINGLE_PRECISION_REAL
elpa1_test_real_single_precision@SUFFIX@_SOURCES = test/Fortran/test_real_single.F90
elpa1_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
......@@ -746,6 +809,7 @@ check_SCRIPTS = \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa2_test_complex_banded@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
elpa1_real_toeplitz@SUFFIX@.sh \
......@@ -848,11 +912,8 @@ elpa2_compute.i: $(top_srcdir)/src/elpa2_compute.F90
elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@
# Run $(CPP) on src/mod_compute_hh_trafo.F90 (with the build directory on the
# include path) and write the preprocessed output to mod_compute_hh_trafo.i.
mod_compute_hh_trafo.i: $(top_srcdir)/src/mod_compute_hh_trafo.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo.F90 -o $@
test_real.i: $(top_srcdir)/test/Fortran/test_real1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/Fortran/test_real1.F90 -o $@
......@@ -883,7 +944,7 @@ CLEANFILES = \
elpa2_test*\
elpa2_real* \
elpa1_real* \
*.sh \
elpa*.sh \
*.i
clean-local:
......@@ -928,6 +989,14 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
......
......@@ -113,6 +113,7 @@ if test x"$with_mpi" = x"yes"; then
AC_MSG_ERROR([Could not compile an MPI Fortran program])
fi
fi
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
......@@ -123,7 +124,7 @@ fi
dnl check which MPI binary invokes an MPI job
if test x"$with_mpi" = x"yes"; then
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun], [no])
AC_CHECK_PROGS([MPI_BINARY], [srun mpiexec.hydra mpiexec mpirun poe runjob], [no])
if test x"$MPI_BINARY" = x"no"; then
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun])
fi
......@@ -163,6 +164,7 @@ install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
#want_avx512=yes
AC_LANG([C])
......@@ -185,7 +187,7 @@ AM_CONDITIONAL([HAVE_REDIRECT],[test x"$enable_redirect" = x"yes"])
dnl build with ftimings support
AC_MSG_CHECKING(whether ELPA should be build with more detailed timing support)
AC_ARG_ENABLE([timings],
AS_HELP_STRING([--enable-timing],
AS_HELP_STRING([--enable-timings],
[more detailed timing, default no.]),
[enable_timings=yes],
[enable_timings=no])
......@@ -249,7 +251,7 @@ if test x"${want_single_precision}" = x"yes" ; then
fi
dnl check whether one can compile with sse-gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
......@@ -283,7 +285,54 @@ else
install_complex_sse_block2=no
fi
dnl check whether one can compile with avx - gcc intrinsics
dnl Decide whether the AVX compile test should run at all.
dnl   check_avx       : yes unless the user passed --disable-avx / --enable-avx=no
dnl   can_compile_avx : result of the compile test below; forced to "no" when
dnl                     the test is switched off
dnl The action-if-given branch inspects $enableval; it used to be hard-wired to
dnl "no", so an explicit --enable-avx disabled the check.
AC_MSG_CHECKING(whether --enable-avx is specified)
AC_ARG_ENABLE([avx],
AS_HELP_STRING([--enable-avx],
[check whether AVX kernels can be build, default yes]),
[if test x"${enableval}" = x"no" ; then
  check_avx=no
else
  check_avx=yes
fi],
[check_avx=yes])
AC_MSG_RESULT([$check_avx])
if test "${check_avx}" = "yes"; then
dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS
AC_MSG_CHECKING([whether we can compile AVX gcc intrinsics in C])
dnl compile-only probe: the uninitialised pointer is never dereferenced at run time
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d a1_1 = _mm256_load_pd(q);
return 0;
}
])],
[can_compile_avx=yes],
[can_compile_avx=no]
)
AC_MSG_RESULT([${can_compile_avx}])
else
can_compile_avx=no
fi
#if test "${can_compile_avx}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d a1_1 = _mm256_load_pd(q);
# return 0;
# }
# ])],
# [can_compile_avx=yes],
# [can_compile_avx=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx}])
# if test "${can_compile_avx}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
# fi
#fi
dnl first pass: try with specified CFLAGS and CXXFLAGS
......@@ -343,7 +392,7 @@ AC_ARG_ENABLE([avx2],
AC_MSG_RESULT([$check_avx2])
if test "${check_avx2}" = "yes"; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_MSG_CHECKING([whether we can compile AVX2 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
......@@ -361,6 +410,77 @@ else
can_compile_avx2=no
fi
#if test "${can_compile_avx2}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX2 gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d q1 = _mm256_load_pd(q);
# __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
# return 0;
# }
# ])],
# [can_compile_avx2=yes],
# [can_compile_avx2=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx2}])
# if test "${can_compile_avx2}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX2!])
# fi
#fi
dnl Decide whether the AVX512 compile test should run at all.
dnl   check_avx512       : yes unless the user passed --disable-avx512 /
dnl                        --enable-avx512=no
dnl   can_compile_avx512 : result of the compile test below; forced to "no"
dnl                        when the test is switched off
dnl The action-if-given branch inspects $enableval; it used to be hard-wired to
dnl "no", so an explicit --enable-avx512 disabled the check.
AC_MSG_CHECKING(whether --enable-avx512 is specified)
AC_ARG_ENABLE([avx512],
AS_HELP_STRING([--enable-avx512],
[check whether AVX512 kernels can be build, default yes]),
[if test x"${enableval}" = x"no" ; then
  check_avx512=no
else
  check_avx512=yes
fi],
[check_avx512=yes])
AC_MSG_RESULT([$check_avx512])
if test "${check_avx512}" = "yes"; then
AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C])
dnl compile-only probe: the uninitialised pointer is never dereferenced at run time
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m512d q1 = _mm512_load_pd(q);
__m512d y1 = _mm512_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx512=yes],
[can_compile_avx512=no]
)
AC_MSG_RESULT([${can_compile_avx512}])
else
can_compile_avx512=no
fi
#if test "${can_compile_avx512}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m512d q1 = _mm512_load_pd(q);
# __m512d y1 = _mm512_fmadd_pd(q1, q1, q1);
# return 0;
# }
# ])],
# [can_compile_avx512=yes],
# [can_compile_avx512=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx512}])
# if test "${can_compile_avx512}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX512!])
# fi
#fi
dnl if test "${can_compile_avx2}" = "yes" ; then
dnl AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
dnl AC_LANG_PUSH([C++])
......@@ -415,6 +535,23 @@ else
install_complex_avx2_block2=no
fi
dnl All AVX512 kernel variants follow the AVX512 compile test: start with every
dnl variant switched off and turn them on together only when the test passed.
install_real_avx512_block2=no
install_real_avx512_block4=no
install_real_avx512_block6=no
install_complex_avx512_block1=no
install_complex_avx512_block2=no
if test "${can_compile_avx512}" = "yes" ; then
  install_real_avx512_block2=yes
  install_real_avx512_block4=yes
  install_real_avx512_block6=yes
  install_complex_avx512_block1=yes
  install_complex_avx512_block2=yes
fi
AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"])
if test x"${can_compile_sse_assembly}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU])
......@@ -432,6 +569,10 @@ AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"])
if test x"${can_compile_avx2}" = x"yes" ; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
dnl Publish the AVX512 compile-test result: as an automake conditional and,
dnl when the test passed, as the HAVE_AVX512 preprocessor define.
AM_CONDITIONAL([HAVE_AVX512],[test x"$can_compile_avx512" = x"yes"])
AS_IF([test x"${can_compile_avx512}" = x"yes"],
      [AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])])
dnl set the AVX optimization flags if this option is specified
AC_MSG_CHECKING(whether AVX optimization flags should be set automatically)
......@@ -515,7 +656,7 @@ if test x"${have_mkl}" = x"yes" ; then
else
dnl first check blas
AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no])
AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no])
AC_MSG_CHECKING([whether we can link a program with a blas lib])
AC_MSG_RESULT([${have_blas}])
......@@ -616,6 +757,23 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
dnl check whether BAND_TO_FULL_BLOCKING is set
dnl Blocking is enabled by default and is switched off only by an explicit
dnl --disable-band-to-full-blocking (or --enable-band-to-full-blocking=no).
dnl The decision is taken inside AC_ARG_ENABLE: $enableval is only meaningful in
dnl the action-if-given branch, and reading it afterwards could pick up a stale
dnl value left behind by an earlier --enable-*/--disable-* option.
AC_MSG_CHECKING(whether BAND_TO_FULL_BLOCKING is requested)
AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--enable-band-to-full-blocking],
[build ELPA2 with blocking in band_to_full (default: enabled)])],
[if test x"${enableval}" = x"no" ; then
  use_band_to_full_blocking=no
else
  use_band_to_full_blocking=yes
fi],
[use_band_to_full_blocking=yes])
AC_MSG_RESULT([${use_band_to_full_blocking}])
AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$use_band_to_full_blocking" = x"yes"])
if test x"${use_band_to_full_blocking}" = x"yes"; then
AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full])
fi
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
......@@ -635,9 +793,28 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
dnl setup nvcc flags and use them in later tests
dnl --with-GPU-compute-capability=value selects the GPU compute capability that
dnl is stored in cuda_compute_capability (default sm_35).
dnl The requested value is captured inside AC_ARG_WITH, where $withval is
dnl defined, instead of reading $withval later on; the prefix test uses a quoted
dnl case pattern rather than piping an unquoted expansion through echo/cut.
user_sets_gpu_compute_capability="no"
cuda_compute_capability="sm_35"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=value],
[use compute capability "value" for GPU version (default sm_35)])],
[user_sets_gpu_compute_capability="yes"
requested_gpu_compute_capability="${withval}"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
dnl sanity check whether compute capability setting by user is reasonable
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
  dnl the user must set a value which starts with "sm_"
  AS_CASE(["${requested_gpu_compute_capability}"],
    [sm_*], [cuda_compute_capability="${requested_gpu_compute_capability}"],
    [AC_MSG_ERROR([Unknown GPU compute capability set: ${requested_gpu_compute_capability}])])
fi
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -O2 -I$CUDA_INSTALL_PATH/include"
CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
......@@ -801,6 +978,16 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx2-block4-kernel-only],[real-avx2-blo
dnl real-avx2-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx2-block6-kernel-only],[real-avx2-block6-kernel],[install_real_avx2_block6])
dnl real-avx512-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block2-kernel-only],[real-avx512-block2-kernel],[install_real_avx512_block2])
dnl real-avx512-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block4-kernel-only],[real-avx512-block4-kernel],[install_real_avx512_block4])
dnl real-avx512-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block6-kernel-only],[real-avx512-block6-kernel],[install_real_avx512_block6])
dnl complex kernels
dnl generic kernel
......@@ -836,6 +1023,13 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx2-block1-kernel-only],[complex
dnl complex-avx2-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx2-block2-kernel-only],[complex-avx2-block2-kernel],[install_complex_avx2_block2])
dnl complex-avx512-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx512-block1-kernel-only],[complex-avx512-block1-kernel],[install_complex_avx512_block1])
dnl complex-avx512-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx512-block2-kernel-only],[complex-avx512-block2-kernel],[install_complex_avx512_block2])
dnl set the conditionals according to the previous tests
......@@ -923,6 +1117,21 @@ if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
dnl Real AVX512 kernels: for each block variant, expose the install_* decision
dnl as an automake conditional and, when it is "yes", as a preprocessor define
dnl of the same name.
AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK2_KERNEL],[test x"$install_real_avx512_block2" = x"yes"])
AS_IF([test x"${install_real_avx512_block2}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX512_BLOCK2_KERNEL],[1],[can use real_avx512_block2 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK4_KERNEL],[test x"$install_real_avx512_block4" = x"yes"])
AS_IF([test x"${install_real_avx512_block4}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX512_BLOCK4_KERNEL],[1],[can use real_avx512_block4 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK6_KERNEL],[test x"$install_real_avx512_block6" = x"yes"])
AS_IF([test x"${install_real_avx512_block6}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX512_BLOCK6_KERNEL],[1],[can use real_avx512_block6 kernel])])
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
......@@ -953,6 +1162,16 @@ if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
dnl Complex AVX512 kernels: for each block variant, expose the install_*
dnl decision as an automake conditional and, when it is "yes", as a
dnl preprocessor define of the same name.
AM_CONDITIONAL([WITH_COMPLEX_AVX512_BLOCK1_KERNEL],[test x"$install_complex_avx512_block1" = x"yes"])
AS_IF([test x"${install_complex_avx512_block1}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX512_BLOCK1_KERNEL],[1],[can use complex_avx512_block1 kernel])])

AM_CONDITIONAL([WITH_COMPLEX_AVX512_BLOCK2_KERNEL],[test x"$install_complex_avx512_block2" = x"yes"])
AS_IF([test x"${install_complex_avx512_block2}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX512_BLOCK2_KERNEL],[1],[can use complex_avx512_block2 kernel])])
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
......@@ -1063,6 +1282,20 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in
])
dnl When the Fortran compiler is exactly "mpiifort" or "ifort", prefix FC with
dnl the manual_cpp wrapper from the source tree. A need_manual_cpp=yes that is
dnl already set when this point is reached triggers the wrapper as well.
AC_MSG_CHECKING([if workaround for Intel's broken preprocessor is needed])
AS_CASE([$FC],
  [mpiifort|ifort], [need_manual_cpp=yes])
AS_IF([test x"$need_manual_cpp" = x"yes"],
  [AC_MSG_RESULT([yes])
   FC="\$(top_srcdir)/manual_cpp $FC"],
  [AC_MSG_RESULT([no])])
AC_OUTPUT
if test "${can_compile_avx}" = "no" ; then
......@@ -1075,6 +1308,12 @@ if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
# fi
fi
dnl Warn whenever can_compile_avx512 is "no". The want_avx512 guard around the
dnl warning is commented out, so the warning does not depend on that variable.
if test "${can_compile_avx512}" = "no" ; then
# if test x"${want_avx512}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX512 instructions])
# fi
fi
if test "${can_compile_sse}" = "no" ; then
AC_MSG_WARN([Could not compile SSE instructions])
......
......@@ -12,9 +12,12 @@
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK2 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK4 16
#define ELPA2_REAL_KERNEL_AVX512_BLOCK6 17
#define ELPA2_REAL_KERNEL_GPU 18
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 18
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
......@@ -27,7 +30,9 @@
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2 13
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
#define ELPA2_COMPLEX_KERNEL_GPU 14
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 14
......@@ -25,9 +25,17 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
install_real_avx_block2=no
install_real_avx_block4=no
install_real_avx_block6=no
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_real_avx512_block2=no
install_real_avx512_block4=no
install_real_avx512_block6=no
want_sse=no
want_avx=no
want_avx2=no
want_avx512=no
install_gpu=no
use_specific_real_kernel=yes
......@@ -42,6 +50,15 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
AC_MSG_NOTICE([$1 set. Also avx_block2 is needed])
install_real_avx_block2=yes
fi
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also avx2_block2 is needed])
install_real_avx2_block2=yes
fi
if test x"${install_real_avx512_block4}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also avx512_block2 is needed])
install_real_avx512_block2=yes
fi
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also sse_block2 is needed])
AC_MSG_NOTICE([$1 set. Also sse_block4 is needed])
......@@ -54,6 +71,19 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
install_real_avx_block4=yes
install_real_avx_block2=yes
fi
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also avx2_block2 is needed])