Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
6a8f926a
Commit
6a8f926a
authored
Mar 30, 2017
by
Andreas Marek
Browse files
Merge branch 'master' into elpa_interface
parents
50d0ec4b
c4f2443e
Changes
105
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
6a8f926a
This diff is collapsed.
Click to expand it.
Makefile.am
View file @
6a8f926a
...
...
@@ -28,10 +28,9 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_precision.F90
\
src/mod_mpi.F90
\
src/mod_mpi_stubs.F90
\
src/mod_pack_unpack_real.F90
\
src/mod_compute_hh_trafo_real.F90
\
src/mod_compute_hh_trafo_complex.F90
\
src/mod_pack_unpack_complex.F90
\
src/mod_redist_band.F90
\
src/mod_pack_unpack_cpu.F90
\
src/mod_compute_hh_trafo.F90
\
src/aligned_mem.F90
\
src/elpa1_compute_private.F90
\
src/elpa2_determine_workload.F90
\
...
...
@@ -42,7 +41,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/check_for_gpu.F90
\
src/mod_cuda.F90
\
src/interface_c_kernel.F90
\
src/mod_pack_unpack_
real_
gpu.F90
\
src/mod_pack_unpack_gpu.F90
\
src/elpa_qr/qr_utils.F90
\
src/elpa_qr/elpa_qrkernels.F90
\
src/elpa_qr/elpa_pdlarfb.F90
\
...
...
@@ -64,7 +63,15 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_real_template.X90
\
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/pack_unpack_cpu.X90
\
src/pack_unpack_gpu.X90
\
src/compute_hh_trafo.X90
\
src/redist_band.X90
\
src/sanity.X90
\
src/elpa_cholesky_template.X90
\
src/elpa_invert_trm.X90
\
src/elpa_multiply_a_b.X90
\
src/elpa_solve_tridi.X90
\
src/precision_macros.h
lib_LTLIBRARIES
=
libelpa@SUFFIX@.la
...
...
@@ -161,6 +168,14 @@ endif
endif
endif
if
WITH_REAL_AVX512_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_2hv_single_precision.c
endif
endif
if
WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -182,6 +197,15 @@ endif
endif
endif
if
WITH_REAL_AVX512_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
endif
endif
if
WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -203,6 +227,14 @@ endif
endif
endif
if
WITH_REAL_AVX512_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
endif
endif
if
WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -224,6 +256,14 @@ endif
endif
endif
if
WITH_COMPLEX_AVX512_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx512_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx512_1hv_single_precision.c
endif
endif
if
WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -245,6 +285,13 @@ endif
endif
endif
if
WITH_COMPLEX_AVX512_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx512_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx512_2hv_single_precision.c
endif
endif
.cu.lo
:
NVCC
=
"
$(NVCC)
"
libtool
--mode
=
compile
--tag
=
CC
$(top_srcdir)
/nvcc_wrap
$(NVCCFLAGS)
$(LDFLAGS)
-I
$(top_builddir)
/
-I
$(top_srcdir)
/
-c
$<
-o
$@
...
...
@@ -281,6 +328,7 @@ dist_man_MANS = \
man/elpa_solve_evp_real_single.3
\
man/elpa_solve_evp_complex_double.3
\
man/elpa_solve_evp_complex_single.3
\
man/elpa_tests.1
\
man/elpa2_print_kernels.1
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -309,6 +357,7 @@ dist_files_DATA = \
test
/Fortran/test_complex2.F90
\
test
/Fortran/test_complex2_default.F90
\
test
/Fortran/test_complex2_api.F90
\
test
/Fortran/test_complex2_banded.F90
\
test
/Fortran/test_complex.F90
\
test
/Fortran/test_real2.F90
\
test
/Fortran/test_real2_default.F90
\
...
...
@@ -325,6 +374,7 @@ dist_files_DATA = \
test
/Fortran/test_cholesky_complex.F90
\
test
/Fortran/test_invert_trm_complex.F90
\
test
/Fortran/test_new_interface.F90
\
test
/Fortran/elpa_tests.F90
\
src/elpa2_print_kernels.F90
#end needed
...
...
@@ -337,6 +387,7 @@ pkgconfig_DATA = @PKG_CONFIG_FILE@
# programs
bin_PROGRAMS
=
\
elpa_tests@SUFFIX@
\
elpa2_print_kernels@SUFFIX@
noinst_PROGRAMS
=
\
...
...
@@ -351,6 +402,7 @@ noinst_PROGRAMS = \
elpa2_test_complex_default@SUFFIX@
\
elpa2_test_complex_api@SUFFIX@
\
elpa2_test_complex_api@SUFFIX@
\
elpa2_test_complex_banded@SUFFIX@
\
elpa_driver_real@SUFFIX@
\
elpa_driver_complex@SUFFIX@
\
elpa1_real_toeplitz@SUFFIX@
\
...
...
@@ -522,6 +574,7 @@ EXTRA_elpa1_complex_invert_trm@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_he
elpa2_test_real@SUFFIX@
_SOURCES
=
test
/Fortran/test_real2.F90
elpa2_test_real@SUFFIX@
_LDADD
=
$(build_lib)
elpa2_test_real@SUFFIX@
_LDFLAGS
=
-static
elpa2_test_real@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
...
...
@@ -565,6 +618,12 @@ elpa2_test_complex_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_api@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_api@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa2_test_complex_banded@SUFFIX@
_SOURCES
=
test
/Fortran/test_complex2_banded.F90
elpa2_test_complex_banded@SUFFIX@
_LDADD
=
$(build_lib)
elpa2_test_complex_banded@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_banded@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa_driver_real@SUFFIX@
_SOURCES
=
test
/Fortran/test_driver_real.F90
elpa_driver_real@SUFFIX@
_LDADD
=
$(build_lib)
elpa_driver_real@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
...
...
@@ -579,6 +638,10 @@ elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90
elpa2_print_kernels@SUFFIX@
_LDADD
=
$(build_lib)
elpa2_print_kernels@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
elpa_tests@SUFFIX@
_SOURCES
=
test
/Fortran/elpa_tests.F90
elpa_tests@SUFFIX@
_LDADD
=
$(build_lib)
elpa_tests@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
if
WANT_SINGLE_PRECISION_REAL
elpa1_test_real_single_precision@SUFFIX@
_SOURCES
=
test
/Fortran/test_real_single.F90
elpa1_test_real_single_precision@SUFFIX@
_LDADD
=
$(build_lib)
...
...
@@ -746,6 +809,7 @@ check_SCRIPTS = \
elpa2_test_real_api@SUFFIX@.sh
\
elpa2_test_real_banded@SUFFIX@.sh
\
elpa2_test_complex_api@SUFFIX@.sh
\
elpa2_test_complex_banded@SUFFIX@.sh
\
elpa_driver_real@SUFFIX@.sh
\
elpa_driver_complex@SUFFIX@.sh
\
elpa1_real_toeplitz@SUFFIX@.sh
\
...
...
@@ -848,11 +912,8 @@ elpa2_compute.i: $(top_srcdir)/src/elpa2_compute.F90
elpa2_kernels_real.i
:
$(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/src/elpa2_kernels/elpa2_kernels_real.F90
-o
$@
mod_compute_hh_trafo_real.i
:
$(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/src/mod_compute_hh_trafo_real.F90
-o
$@
mod_compute_hh_trafo_complex.i
:
$(top_srcdir)/src/mod_compute_hh_trafo_complex.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/src/mod_compute_hh_trafo_complex.F90
-o
$@
mod_compute_hh_trafo.i
:
$(top_srcdir)/src/mod_compute_hh_trafo.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/src/mod_compute_hh_trafo.F90
-o
$@
test_real.i
:
$(top_srcdir)/test/Fortran/test_real1.F90
$(CPP)
$(CPPFLAGS)
-I
$(top_builddir)
/
-c
$(top_srcdir)
/test/Fortran/test_real1.F90
-o
$@
...
...
@@ -883,7 +944,7 @@ CLEANFILES = \
elpa2_test
*
\
elpa2_real
*
\
elpa1_real
*
\
*
.sh
\
elpa
*
.sh
\
*
.i
clean-local
:
...
...
@@ -928,6 +989,14 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/redist_band.X90
\
src/pack_unpack_cpu.X90
\
src/pack_unpack_gpu.X90
\
src/compute_hh_trafo.X90
\
src/sanity.X90
\
src/elpa_cholesky_template.X90
\
src/elpa_invert_trm.X90
\
src/elpa_multiply_a_b.X90
\
src/elpa_solve_tridi.X90
\
src/elpa_qr/elpa_qrkernels.X90
\
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu
\
src/ev_tridi_band_gpu_c_v2_real_template.Xcu
\
...
...
configure.ac
View file @
6a8f926a
...
...
@@ -113,6 +113,7 @@ if test x"$with_mpi" = x"yes"; then
AC_MSG_ERROR([Could not compile an MPI Fortran program])
fi
fi
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
...
...
@@ -123,7 +124,7 @@ fi
dnl check which MPI binary invokes an MPI job
if test x"$with_mpi" = x"yes"; then
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun], [no])
AC_CHECK_PROGS([MPI_BINARY], [srun mpiexec.hydra mpiexec mpirun poe runjob], [no])
if test x"$MPI_BINARY" = x"no"; then
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun])
fi
...
...
@@ -163,6 +164,7 @@ install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
#want_avx512=yes
AC_LANG([C])
...
...
@@ -185,7 +187,7 @@ AM_CONDITIONAL([HAVE_REDIRECT],[test x"$enable_redirect" = x"yes"])
dnl build with ftimings support
AC_MSG_CHECKING(whether ELPA should be build with more detailed timing support)
AC_ARG_ENABLE([timings],
AS_HELP_STRING([--enable-timing],
AS_HELP_STRING([--enable-timings],
[more detailed timing, default no.]),
[enable_timings=yes],
[enable_timings=no])
...
...
@@ -249,7 +251,7 @@ if test x"${want_single_precision}" = x"yes" ; then
fi
dnl check whether one can compile with sse-gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
...
...
@@ -283,7 +285,54 @@ else
install_complex_sse_block2=no
fi
dnl check whether one can compile with avx - gcc intrinsics
dnl Decide whether the AVX compile check runs at all.
dnl The check is on by default. Only --disable-avx or --enable-avx=no turns
dnl it off. Previously any use of the option switched the check off, so that
dnl --enable-avx disabled AVX, which contradicted the help text.
AC_MSG_CHECKING(whether the AVX compile check is enabled)
AC_ARG_ENABLE([avx],
AS_HELP_STRING([--enable-avx],
[check whether AVX kernels can be build, default yes]),
[if test x"${enableval}" = x"no" ; then
   check_avx=no
 else
   check_avx=yes
 fi],
[check_avx=yes])
AC_MSG_RESULT([$check_avx])
if test "${check_avx}" = "yes"; then
  dnl check whether one can compile with avx - gcc intrinsics
  dnl first pass: try with specified CFLAGS and CXXFLAGS
  dnl This is a compile-only probe: the program is never run, so the
  dnl uninitialised pointer q is harmless here.
  AC_MSG_CHECKING([whether we can compile AVX gcc intrinsics in C])
  AC_COMPILE_IFELSE([AC_LANG_SOURCE([
  #include <x86intrin.h>
  int main(int argc, char **argv){
  double* q;
  __m256d a1_1 = _mm256_load_pd(q);
  return 0;
  }
  ])],
  [can_compile_avx=yes],
  [can_compile_avx=no]
  )
  AC_MSG_RESULT([${can_compile_avx}])
else
  dnl check disabled by the user: treat AVX as not compilable
  can_compile_avx=no
fi
#if test "${can_compile_avx}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d a1_1 = _mm256_load_pd(q);
# return 0;
# }
# ])],
# [can_compile_avx=yes],
# [can_compile_avx=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx}])
# if test "${can_compile_avx}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
# fi
#fi
dnl first pass: try with specified CFLAGS and CXXFLAGS
...
...
@@ -343,7 +392,7 @@ AC_ARG_ENABLE([avx2],
AC_MSG_RESULT([$check_avx2])
if test "${check_avx2}" = "yes"; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_MSG_CHECKING([whether we can compile AVX2 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
...
...
@@ -361,6 +410,77 @@ else
can_compile_avx2=no
fi
#if test "${can_compile_avx2}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX2 gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d q1 = _mm256_load_pd(q);
# __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
# return 0;
# }
# ])],
# [can_compile_avx2=yes],
# [can_compile_avx2=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx2}])
# if test "${can_compile_avx2}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX2!])
# fi
#fi
dnl Decide whether the AVX512 compile check runs at all.
dnl The check is on by default. Only --disable-avx512 or --enable-avx512=no
dnl turns it off. Previously any use of the option switched the check off,
dnl so that --enable-avx512 disabled AVX512, contradicting the help text.
AC_MSG_CHECKING(whether the AVX512 compile check is enabled)
AC_ARG_ENABLE([avx512],
AS_HELP_STRING([--enable-avx512],
[check whether AVX512 kernels can be build, default yes]),
[if test x"${enableval}" = x"no" ; then
   check_avx512=no
 else
   check_avx512=yes
 fi],
[check_avx512=yes])
AC_MSG_RESULT([$check_avx512])
if test "${check_avx512}" = "yes"; then
  dnl Compile-only probe: a 512 bit load plus a fused multiply-add must be
  dnl accepted by the C compiler with the current flags. The program is
  dnl never run, so the uninitialised pointer q is harmless here.
  AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C])
  AC_COMPILE_IFELSE([AC_LANG_SOURCE([
  #include <x86intrin.h>
  int main(int argc, char **argv){
  double* q;
  __m512d q1 = _mm512_load_pd(q);
  __m512d y1 = _mm512_fmadd_pd(q1, q1, q1);
  return 0;
  }
  ])],
  [can_compile_avx512=yes],
  [can_compile_avx512=no]
  )
  AC_MSG_RESULT([${can_compile_avx512}])
else
  dnl check disabled by the user: treat AVX512 as not compilable
  can_compile_avx512=no
fi
#if test "${can_compile_avx512}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m512d q1 = _mm512_load_pd(q);
# __m512d y1 = _mm512_fmadd_pd(q1, q1, q1);
# return 0;
# }
# ])],
# [can_compile_avx512=yes],
# [can_compile_avx512=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx512}])
# if test "${can_compile_avx512}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX512!])
# fi
#fi
dnl if test "${can_compile_avx2}" = "yes" ; then
dnl AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
dnl AC_LANG_PUSH([C++])
...
...
@@ -415,6 +535,23 @@ else
install_complex_avx2_block2=no
fi
if test "${can_compile_avx512}" = "yes" ; then
install_real_avx512_block2=yes
install_real_avx512_block4=yes
install_real_avx512_block6=yes
install_complex_avx512_block1=yes
install_complex_avx512_block2=yes
else
install_real_avx512_block2=no
install_real_avx512_block4=no
install_real_avx512_block6=no
install_complex_avx512_block1=no
install_complex_avx512_block2=no
fi
AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"])
if test x"${can_compile_sse_assembly}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU])
...
...
@@ -432,6 +569,10 @@ AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"])
if test x"${can_compile_avx2}" = x"yes" ; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX512],[test x"$can_compile_avx512" = x"yes"])
if test x"${can_compile_avx512}" = x"yes" ; then
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
fi
dnl set the AVX optimization flags if this option is specified
AC_MSG_CHECKING(whether AVX optimization flags should be set automatically)
...
...
@@ -515,7 +656,7 @@ if test x"${have_mkl}" = x"yes" ; then
else
dnl first check blas
AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no])
AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no])
AC_MSG_CHECKING([whether we can link a program with a blas lib])
AC_MSG_RESULT([${have_blas}])
...
...
@@ -616,6 +757,23 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
dnl check whether BAND_TO_FULL_BLOCKING is set
use_band_to_full_blocking=yes
AC_MSG_CHECKING(whether BAND_TO_FLULL_BLOCKING is requested)
AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--enable-band-to-full-blocking],
[build ELPA2 with blocking in band_to_full (default: enabled)])],
want_band_to_full_blocking="yes", want_to_full_blocking="no")
AC_MSG_RESULT([${want_band_to_full_blocking}])
if test x"${enableval}" = x"no" ; then
use_band_to_full_blocking=no
fi
AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$use_band_to_full_blocking" = x"yes"])
if test x"${use_band_to_full_blocking}" = x"yes"; then
AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full])
fi
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
...
...
@@ -635,9 +793,28 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
dnl setup nvcc flags and use them in later tests
dnl Select the GPU compute capability that is stored in
dnl cuda_compute_capability. Without --with-GPU-compute-capability the value
dnl sm_35 is used. With the option the given value is validated below.
user_sets_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=value],
[use compute capability "value" for GPU version (default sm_35)])],
[user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
dnl sanity check whether compute capability setting by user is reasonable
dnl NOTE(review): withval is read here after AC_ARG_WITH has finished. This
dnl relies on no other AC_ARG_WITH being expanded in between - confirm.
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
dnl the user must set a value which starts with "sm_"
dnl only the first three characters are inspected; the rest is not validated
value=$(echo $withval | cut -c1-3)
if test x"${value}" = x"sm_" ; then
cuda_compute_capability=$withval
else
AC_MSG_ERROR([Unknown GPU compute capability set: ${withval}])
fi
fi
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -O2 -I$CUDA_INSTALL_PATH/include"
CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
...
...
@@ -801,6 +978,16 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx2-block4-kernel-only],[real-avx2-blo
dnl real-avx2-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx2-block6-kernel-only],[real-avx2-block6-kernel],[install_real_avx2_block6])
dnl real-avx512-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block2-kernel-only],[real-avx512-block2-kernel],[install_real_avx512_block2])
dnl real-avx512-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block4-kernel-only],[real-avx512-block4-kernel],[install_real_avx512_block4])
dnl real-avx512-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx512-block6-kernel-only],[real-avx512-block6-kernel],[install_real_avx512_block6])
dnl complex kernels
dnl generic kernel
...
...
@@ -836,6 +1023,13 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx2-block1-kernel-only],[complex
dnl complex-avx2-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx2-block2-kernel-only],[complex-avx2-block2-kernel],[install_complex_avx2_block2])
dnl complex-avx512-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx512-block1-kernel-only],[complex-avx512-block1-kernel],[install_complex_avx512_block1])
dnl complex-avx512-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx512-block2-kernel-only],[complex-avx512-block2-kernel],[install_complex_avx512_block2])
dnl set the conditionals according to the previous tests
...
...
@@ -923,6 +1117,21 @@ if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK2_KERNEL],[test x"$install_real_avx512_block2" = x"yes"])
if test x"${install_real_avx512_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX512_BLOCK2_KERNEL],[1],[can use real_avx512_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK4_KERNEL],[test x"$install_real_avx512_block4" = x"yes"])
if test x"${install_real_avx512_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX512_BLOCK4_KERNEL],[1],[can use real_avx512_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX512_BLOCK6_KERNEL],[test x"$install_real_avx512_block6" = x"yes"])
if test x"${install_real_avx512_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX512_BLOCK6_KERNEL],[1],[can use real_avx512_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
...
...
@@ -953,6 +1162,16 @@ if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX512_BLOCK1_KERNEL],[test x"$install_complex_avx512_block1" = x"yes"])
if test x"${install_complex_avx512_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX512_BLOCK1_KERNEL],[1],[can use complex_avx512_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX512_BLOCK2_KERNEL],[test x"$install_complex_avx512_block2" = x"yes"])
if test x"${install_complex_avx512_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX512_BLOCK2_KERNEL],[1],[can use complex_avx512_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...
...
@@ -1063,6 +1282,20 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in
])
dnl Decide whether the Fortran compiler has to be wrapped by the manual_cpp
dnl script. The workaround is switched on only when FC is exactly the string
dnl mpiifort or ifort; a full path or any other spelling does not match.
dnl NOTE(review): need_manual_cpp is not initialised in the lines visible
dnl here - confirm it cannot be inherited from the environment.
AC_MSG_CHECKING([if workaround for Intel's broken preprocessor is needed])
if test x"$FC" = x"mpiifort" ; then
need_manual_cpp=yes
fi
if test x"$FC" = x"ifort" ; then
need_manual_cpp=yes
fi
if test x"$need_manual_cpp" = x"yes" ; then
AC_MSG_RESULT([yes])
dnl Prefix FC with the wrapper script. The escaped dollar keeps the literal
dnl text of the top_srcdir reference in FC instead of letting the shell
dnl expand it here - presumably so that make expands it later; verify.
FC="\$(top_srcdir)/manual_cpp $FC"
else
AC_MSG_RESULT([no])
fi
AC_OUTPUT
if test "${can_compile_avx}" = "no" ; then
...
...
@@ -1075,6 +1308,12 @@ if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
# fi
fi
if test "${can_compile_avx512}" = "no" ; then
# if test x"${want_avx512}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX512 instructions])
# fi
fi
if test "${can_compile_sse}" = "no" ; then
AC_MSG_WARN([Could not compile SSE instructions])
...
...
elpa/elpa_kernel_constants.h
View file @
6a8f926a
...
...
@@ -12,9 +12,12 @@
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK2 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK4 16
#define ELPA2_REAL_KERNEL_AVX512_BLOCK6 17
#define ELPA2_REAL_KERNEL_GPU 18
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 18
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
...
...
@@ -27,7 +30,9 @@
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2 13
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
#define ELPA2_COMPLEX_KERNEL_GPU 14
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 14
m4/ax_elpa_specific_kernels.m4
View file @
6a8f926a
...
...
@@ -25,9 +25,17 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
install_real_avx_block2=no
install_real_avx_block4=no
install_real_avx_block6=no
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_real_avx512_block2=no
install_real_avx512_block4=no
install_real_avx512_block6=no
want_sse=no
want_avx=no
want_avx2=no
want_avx512=no
install_gpu=no
use_specific_real_kernel=yes
...
...
@@ -42,6 +50,15 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
AC_MSG_NOTICE([$1 set. Also avx_block2 is needed])
install_real_avx_block2=yes
fi
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also avx2_block2 is needed])
install_real_avx2_block2=yes
fi
if test x"${install_real_avx512_block4}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also avx512_block2 is needed])
install_real_avx512_block2=yes
fi
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_MSG_NOTICE([$1 set. Also sse_block2 is needed])
AC_MSG_NOTICE([$1 set. Also sse_block4 is needed])
...
...
@@ -54,6 +71,19 @@ AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
install_real_avx_block4=yes
install_real_avx_block2=yes
fi
if test x"${install_real_avx2_block6}" = x"yes" ; then