Commit aa5adc8d authored by Andreas Marek's avatar Andreas Marek

Merge branch 'NEON' into 'master_pre_stage'

Neon

See merge request !18
parents 5cab7d71 fc70acb2
This diff is collapsed.
......@@ -893,62 +893,61 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
@top_srcdir@/src/elpa2/kernels/simple_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/complex_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \
@top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \
@top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \
......
......@@ -110,6 +110,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \
......@@ -194,9 +195,10 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif
#if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
#endif
if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......@@ -227,6 +229,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -269,6 +278,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -311,6 +327,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -768,28 +791,15 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx-avx2_2hv_template.c \
src/elpa2/kernels/real_avx-avx2_4hv_template.c \
src/elpa2/kernels/real_avx-avx2_6hv_template.c \
src/elpa2/kernels/real_avx512_2hv_template.c \
src/elpa2/kernels/real_avx512_4hv_template.c \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
This diff is collapsed.
......@@ -125,7 +125,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
# GPU runners
if [ "$CI_RUNNER_TAGS" == "gpu" ]
......@@ -144,16 +144,17 @@ then
cat ./run_${CLUSTER}_1node_2GPU.sh
echo " "
echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node_2GPU.sh
exitCode=$?
if sbatch -W ./run_${CLUSTER}_1node_2GPU.sh; then
exitCode=$?
else
exitCode=$?
echo "Submission exited with exitCode $exitCode"
fi
echo " "
echo "Exit Code of sbatch: $exitCode"
echo " "
if (( $exitCode > 0 ))
then
#if (( $exitCode > 0 ))
#then
cat ./ELPA_CI_2gpu.err.*
fi
#fi
fi
......@@ -174,24 +175,31 @@ then
cat ./run_${CLUSTER}_1node.sh
echo " "
echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node.sh
exitCode=$?
if sbatch -W ./run_${CLUSTER}_1node.sh; then
exitCode=$?
else
exitCode=$?
echo "Submission excited with exitCode $exitCode"
fi
echo " "
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
fi
#fi
fi
if [ $exitCode -ne 0 ]
#if [ $exitCode -ne 0 ]
#then
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
#fi
exit $exitCode
......
......@@ -102,7 +102,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#distcheck
if [[ "$CI_RUNNER_TAGS" =~ "distcheck" ]]
......@@ -126,17 +126,21 @@ then
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
fi
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./test-suite.log
fi
#fi
exit $exitCode
......
......@@ -120,7 +120,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#project_test
if [[ "$CI_RUNNER_TAGS" =~ "project_test" ]]
......@@ -129,18 +129,18 @@ then
echo "mkdir -p build" >> ./run_${CLUSTER}_1node.sh
echo "pushd build" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh
echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$configureArgs" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make install" >> ./run_${CLUSTER}_1node.sh
echo "#Running make install" >> ./run_${CLUSTER}_1node.sh
echo "make install" >> ./run_${CLUSTER}_1node.sh
echo "popd" >> ./run_${CLUSTER}_1node.sh
echo "mkdir -p $projectName/build" >> ./run_${CLUSTER}_1node.sh
......@@ -149,19 +149,19 @@ then
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " Testting project " >> ./run_${CLUSTER}_1node.sh
echo " #Testting project " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh
echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$projectConfigureArgs " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo "export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:\$LD_LIBRARY_PATH" >> ./run_${CLUSTER}_1node.sh
echo "./$projectExecutable" >> ./run_${CLUSTER}_1node.sh
......@@ -184,18 +184,17 @@ then
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
fi
if [ $exitCode -ne 0 ]
then
cat ./test-suite.log
fi
exit $exitCode
fi
......@@ -50,13 +50,15 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make])
fi
enable_legacy=no
AC_MSG_CHECKING(whether legacy interface should be provided)
AC_ARG_ENABLE([legacy-interface],
AS_HELP_STRING([--enable-legacy-interface],
[build legacy API, default no]),
[
if test x"$enableval" = x"yes"; then
enable_legayc=yes
enable_legacy=yes
else
enable_legacy=no
fi
......@@ -227,9 +229,9 @@ fi
dnl check which MPI binray invokes a MPI job
if test x"$with_mpi" = x"yes"; then
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun], [no])
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun aprun], [no])
if test x"$MPI_BINARY" = x"no"; then
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun])
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun, aprun])
fi
fi
......@@ -613,6 +615,7 @@ m4_define(elpa_m4_generic_kernels, [
real_generic
real_generic_simple
real_generic_simple_block4
real_generic_simple_block6
complex_generic
complex_generic_simple
])
......@@ -636,6 +639,12 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block6
])
m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
......@@ -681,7 +690,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -715,6 +724,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([neon_arch64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
......@@ -730,7 +740,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -790,7 +800,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -848,7 +858,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -895,7 +905,6 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......@@ -917,6 +926,27 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi
if test x"${need_neon_arch64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_neon.h>
int main(int argc, char **argv) {
__Float64x2_t x1, x2, x3, x4;
x4 = vfmaq_f64(x1, x2, x3);
return 0;
}
])],
[can_compile_neon_arch64=yes],
[can_compile_neon_arch64=no]
)
AC_MSG_RESULT([${can_compile_neon_arch64}])
if test x"$can_compile_neon_arch64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU])
fi
if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
......@@ -44,10 +44,14 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
......@@ -339,6 +339,7 @@
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
......@@ -850,6 +851,43 @@
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of vsx block 2 real case
......@@ -1656,6 +1694,77 @@
#endif /* REALCASE */
#if REALCASE == 1
! neon_arch64 block 4 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL))
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
......@@ -2040,6 +2149,94 @@
#endif /* REALCASE */
#if REALCASE == 1
!neon_arch64 block6 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP