...
 
Commits (75)
This diff is collapsed.
......@@ -2,7 +2,8 @@ Changelog for upcoming release
- not yet decided
Changelog for ELPA 2019.05.001.rc1
Changelog for ELPA 2019.05.001
- elpa_print_kernels supports GPU usage
- fix an error if PAPI measurements are activated
- new simple real kernels: block4 and block6
......@@ -21,6 +22,9 @@ been introduced a year ago. Removed routines:
-> cholesky_real
-> cholesky_complex
-> solve_tridi
- new kernels for ARM arch64 added
- fix an out-of-bound-error in elpa2
Changelog for ELPA 2018.11.001
......
......@@ -893,62 +893,61 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
@top_srcdir@/src/elpa2/kernels/simple_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/complex_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \
@top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \
@top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \
......
......@@ -2,7 +2,7 @@
## Preamble ##
This file provides documentation on how to build the *ELPA* library in **version ELPA-2019.05.001.rc1**.
This file provides documentation on how to build the *ELPA* library in **version ELPA-2019.05.001**.
With release of **version ELPA-2017.05.001** the build process has been significantly simplified,
which makes it easier to install the *ELPA* library.
......@@ -16,7 +16,7 @@ With release ELPA 2019.05.001 the legacy API is disabled by default, however,
can be still switched on at build time.
Most likely with the release ELPA 2019.11.001 the legacy API will be deprecated and not supported anymore.
The release of ELPA 2019.05.001.rc1 changes the ABI and API, since it allows to also build the C-functions with optional error arguments
The release of ELPA 2019.05.001 changes the ABI and API, since it also allows the C functions to be built with optional error arguments.
## How to install *ELPA* ##
......
......@@ -3,7 +3,7 @@
For more details and recent updates please visit the online [issue system](https://gitlab.mpcdf.mpg.de/elpa/elpa/issues).
Issues which are not mentioned in a newer release are (considered as) solved.
### ELPA 2019.11.001.rc1 release ###
### ELPA 2019.11.001 release ###
- same issues as in ELPA 2017.11.001
### ELPA 2018.11.001 release ###
......
......@@ -110,6 +110,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/kernels/blas_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \
......@@ -194,9 +196,14 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif
#if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
#endif
if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
endif
if WITH_REAL_BLAS_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_blas_block4.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......@@ -227,6 +234,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -269,6 +283,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -311,6 +332,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -768,28 +796,16 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx-avx2_2hv_template.c \
src/elpa2/kernels/real_avx-avx2_4hv_template.c \
src/elpa2/kernels/real_avx-avx2_6hv_template.c \
src/elpa2/kernels/real_avx512_2hv_template.c \
src/elpa2/kernels/real_avx512_4hv_template.c \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/kernels/blas_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
......@@ -2,7 +2,7 @@
## Current Release ##
The current release is ELPA 2019.05.001.rc1 The current supported API version
The current release is ELPA 2019.05.001. The current supported API version
is 20190501. This release supports the earliest API version 20170403.
The old, obsolete legacy API will be deprecated in the future !
......@@ -76,6 +76,8 @@ No other conditions have to be met.
Nonetheless, we are grateful if you cite the following publications:
If you use ELPA in general:
T. Auckenthaler, V. Blum, H.-J. Bungartz, T. Huckle, R. Johanni,
L. Kr\"amer, B. Lang, H. Lederer, and P. R. Willems,
"Parallel solution of partial symmetric eigenvalue problems from
......@@ -90,12 +92,20 @@ Nonetheless, we are grateful if you cite the following publications:
Journal of Physics Condensed Matter, 26 (2014)
doi:10.1088/0953-8984/26/21/213201
If you use the GPU version of ELPA:
Kus, P.; Marek, A.; Lederer, H.
"GPU Optimization of Large-Scale Eigenvalue Solver",
In: Radu F., Kumar K., Berre I., Nordbotten J., Pop I. (eds)
Numerical Mathematics and Advanced Applications ENUMATH 2017. ENUMATH 2017.
Lecture Notes in Computational Science and Engineering, vol 126. Springer, Cham
If you use the new API and/or autotuning:
Kus, P.; Marek, A.; Koecher, S. S.; Kowalski, H.-H.; Carbogno, Ch.; Scheurer, Ch.; Reuter, K.; Scheffler, M.; Lederer, H.
"Optimizations of the Eigensolvers in the ELPA Library",
Parallel Computing 85, 167-177 (2019)
## Installation of the *ELPA* library
......@@ -115,7 +125,7 @@ the possible configure options.
## Using *ELPA*
Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online]
(http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) doxygen
(http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) doxygen
documentation, where you find the definition of the interfaces.
## Contributing to *ELPA*
......
This file contains the release notes for the ELPA 2019.05.001.rc1 version
This file contains the release notes for the ELPA 2019.05.001 version
What is new?
-------------
......@@ -9,6 +9,7 @@ For detailed information about changes since release ELPA 2018.11 please have a
- C functions can have an optional error argument, if compiler supports this
=> ABI and API change
- as announced, removal of deprecated routines
- new kernels for Arm arch64
ABI change
---------------------
......
......@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program which prints all
the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html)
Also an [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html)
for each *ELPA* release is available.
......@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all
the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html)
Also an [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html)
for each *ELPA* release is available.
......@@ -200,7 +200,7 @@ The following table gives a list of all supported parameters which can be used t
## III) List of computational routines ##
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) for details.
The following compute routines are available in *ELPA*. Please have a look at the man pages or the [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) for details.
| Name | Purpose | since API version |
......
......@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts:
Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices.
Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) for details):
Thus *ELPA* provides the following user functions (see man pages or [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) for details):
- elpa_get_communicators : set the row / column communicators for *ELPA*
- elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver
......
This diff is collapsed.
......@@ -125,7 +125,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
# GPU runners
if [ "$CI_RUNNER_TAGS" == "gpu" ]
......@@ -144,16 +144,17 @@ then
cat ./run_${CLUSTER}_1node_2GPU.sh
echo " "
echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node_2GPU.sh
exitCode=$?
if sbatch -W ./run_${CLUSTER}_1node_2GPU.sh; then
exitCode=$?
else
exitCode=$?
echo "Submission exited with exitCode $exitCode"
fi
echo " "
echo "Exit Code of sbatch: $exitCode"
echo " "
if (( $exitCode > 0 ))
then
#if (( $exitCode > 0 ))
#then
cat ./ELPA_CI_2gpu.err.*
fi
#fi
fi
......@@ -174,24 +175,31 @@ then
cat ./run_${CLUSTER}_1node.sh
echo " "
echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node.sh
exitCode=$?
# Submit the single-node job and wait for it to finish (sbatch -W), recording
# its exit status in exitCode on both the success and the failure path.
# NOTE(review): sbatch is presumably run as an if-condition so that a non-zero
# status does not abort the script when errexit is active -- confirm.
if sbatch -W ./run_${CLUSTER}_1node.sh; then
  exitCode=$?
else
  exitCode=$?
  echo "Submission exited with exitCode $exitCode"
fi
echo " "
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
fi
#fi
fi
if [ $exitCode -ne 0 ]
#if [ $exitCode -ne 0 ]
#then
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
#fi
exit $exitCode
......
......@@ -102,7 +102,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#distcheck
if [[ "$CI_RUNNER_TAGS" =~ "distcheck" ]]
......@@ -126,17 +126,21 @@ then
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
fi
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./test-suite.log
fi
#fi
exit $exitCode
......
......@@ -120,7 +120,7 @@ then
CLUSTER="draco"
fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS"
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#project_test
if [[ "$CI_RUNNER_TAGS" =~ "project_test" ]]
......@@ -129,18 +129,18 @@ then
echo "mkdir -p build" >> ./run_${CLUSTER}_1node.sh
echo "pushd build" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh
echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$configureArgs" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make install" >> ./run_${CLUSTER}_1node.sh
echo "#Running make install" >> ./run_${CLUSTER}_1node.sh
echo "make install" >> ./run_${CLUSTER}_1node.sh
echo "popd" >> ./run_${CLUSTER}_1node.sh
echo "mkdir -p $projectName/build" >> ./run_${CLUSTER}_1node.sh
......@@ -149,19 +149,19 @@ then
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " Testting project " >> ./run_${CLUSTER}_1node.sh
echo " #Testting project " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh
echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$projectConfigureArgs " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo "export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:\$LD_LIBRARY_PATH" >> ./run_${CLUSTER}_1node.sh
echo "./$projectExecutable" >> ./run_${CLUSTER}_1node.sh
......@@ -184,18 +184,17 @@ then
echo "Exit Code of sbatch: $exitCode"
echo " "
cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ]
then
#if [ $exitCode -ne 0 ]
#then
cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi
fi
if [ $exitCode -ne 0 ]
then
cat ./test-suite.log
fi
exit $exitCode
fi
......@@ -50,13 +50,15 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make])
fi
enable_legacy=no
AC_MSG_CHECKING(whether legacy interface should be provided)
AC_ARG_ENABLE([legacy-interface],
AS_HELP_STRING([--enable-legacy-interface],
[build legacy API, default no]),
[
if test x"$enableval" = x"yes"; then
enable_legayc=yes
enable_legacy=yes
else
enable_legacy=no
fi
......@@ -227,9 +229,9 @@ fi
dnl check which MPI binary invokes an MPI job
if test x"$with_mpi" = x"yes"; then
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun], [no])
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun aprun], [no])
if test x"$MPI_BINARY" = x"no"; then
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun])
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun, aprun])
fi
fi
......@@ -613,8 +615,10 @@ m4_define(elpa_m4_generic_kernels, [
real_generic
real_generic_simple
real_generic_simple_block4
real_generic_simple_block6
complex_generic
complex_generic_simple
real_blas_block4
])
m4_define(elpa_m4_sse_assembly_kernels, [
......@@ -636,6 +640,12 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block6
])
m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
......@@ -681,7 +691,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -715,6 +725,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([neon_arch64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
......@@ -730,7 +741,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
# Report only kernels that were enabled before the BGP/BGQ option overrides them.
# The comparison needs the test builtin. Without it the shell tries to run the
# expanded word as a command and the message is never printed.
if test x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -790,7 +801,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -848,7 +859,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -895,7 +906,6 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......@@ -917,6 +927,27 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi
dnl Compile probe for the NEON ARCH64 kernels. It only runs when
dnl need_neon_arch64 is set to yes.
if test x"${need_neon_arch64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C)
dnl Compile-only test: includes arm_neon.h and calls vfmaq_f64 on
dnl __Float64x2_t values. The program is never executed.
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_neon.h>
int main(int argc, char **argv) {
__Float64x2_t x1, x2, x3, x4;
x4 = vfmaq_f64(x1, x2, x3);
return 0;
}
])],
[can_compile_neon_arch64=yes],
[can_compile_neon_arch64=no]
)
AC_MSG_RESULT([${can_compile_neon_arch64}])
dnl A failed probe stops configure with a hint on how to proceed.
if test x"$can_compile_neon_arch64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS])
fi
dnl Reached only when the probe compiled: publish the feature macro.
AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU])
fi
if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......@@ -1502,12 +1533,12 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
#echo "***********************************************************************"
#echo " "
#echo " "
echo "***********************************************************************"
echo "* This is a the first release candidate of ELPA 2019.05.001.rc1 *"
echo "* There might be still some changes until the final release of *"
echo "* ELPA 2019.05.001 *"
echo "***********************************************************************"
echo " "
#echo "***********************************************************************"
#echo "* This is a the first release candidate of ELPA 2019.05.001.rc2 *"
#echo "* There might be still some changes until the final release of *"
#echo "* ELPA 2019.05.001 *"
#echo "***********************************************************************"
#echo " "
if test x"$enable_kcomputer" = x"yes" ; then
echo " "
......
......@@ -19,7 +19,7 @@
%define with_openmp 0
Name: elpa
Version: 2019.05.001.rc1
Version: 2019.05.001
Release: 1
Summary: A massively parallel eigenvector solver
License: LGPL-3.0
......
......@@ -44,10 +44,15 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BLAS_BLOCK4, 30, @ELPA_2STAGE_REAL_BLAS_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
This diff is collapsed.
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#if REALCASE==1
! Apply four Householder vectors at once to q, with the whole update expressed
! through BLAS calls (SYRK, GEMM, SCAL, GEMV).
!
! Arguments:
!   useGPU - not referenced inside this routine
!   q      - updated in place; rows 1..nq and columns 1..nb+3 are touched
!   hh     - columns 1..4 are read: hh(1,i) is used as the scalar factor of
!            vector i, hh(2:nb,i) as its remaining entries
!   nb     - number of entries taken from each column of hh
!   nq     - number of rows of q that take part in the update
!   ldq    - leading dimension of q
!   ldh    - leading dimension of hh
subroutine quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (useGPU, q, hh, nb, nq, ldq, ldh)
  use precision
  implicit none
#include "../../general/precision_kinds.F90"
  logical, intent(in) :: useGPU
  integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef USE_ASSUMED_SIZE
  real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
  real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
#else
  real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+3)
  real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
#endif
  ! Work arrays (automatic, sized from the dummy arguments)
  real(kind=C_DATATYPE_KIND) :: w_comb(nq, 4)
  real(kind=C_DATATYPE_KIND) :: h_mat(4, nb+3)
  real(kind=C_DATATYPE_KIND) :: s_mat(4, 4)
  integer(kind=ik) :: i

  ! Build h_mat: row r holds the negated r-th Householder vector, shifted so
  ! that its leading entry (stored as -1) sits in column 5-r and the entries
  ! hh(2:nb,r) follow it. Everything else stays zero.
  h_mat(:,:) = 0.0_rk
  h_mat(1,4) = -1.0_rk
  h_mat(2,3) = -1.0_rk
  h_mat(3,2) = -1.0_rk
  h_mat(4,1) = -1.0_rk
  h_mat(1,5:nb+3) = -hh(2:nb, 1)
  h_mat(2,4:nb+2) = -hh(2:nb, 2)
  h_mat(3,3:nb+1) = -hh(2:nb, 3)
  h_mat(4,2:nb) = -hh(2:nb, 4)

  ! Pairwise products of the rows of h_mat; only the lower triangle of s_mat
  ! is written by SYRK and only its strictly lower part is read below.
  ! TODO we do not need the diagonal, but how to do it with BLAS?
  !s_mat = - matmul(h_mat, transpose(h_mat))
  call PRECISION_SYRK('L', 'N', 4, nb+3, &
                      -ONE, h_mat, 4, &
                      ZERO, s_mat, 4)

  ! w_comb = - matmul(q(1:nq, 1:nb+3), transpose(h_mat))
  call PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
                      -ONE, q, ldq, &
                      h_mat, 4, &
                      ZERO, w_comb, nq)

  ! Scale the first column by hh(1,1)
  !w_comb(1:nq,1) = hh(1,1) * w_comb(1:nq, 1)
  call PRECISION_SCAL(nq, hh(1,1), w_comb(1:nq, 1), 1)

  ! Columns 2..4: combine the already finished columns 1..i-1 (weighted by
  ! row i of s_mat) with column i, everything scaled by hh(1,i).
  ! s_mat(i,1:i-1) is a strided row section handed to BLAS with increment 1,
  ! so the compiler is expected to pass a contiguous temporary copy.
  do i = 2, 4
    ! w_comb(1:nq,i) = matmul(w_comb(1:nq,1:i-1), hh(1,i) * s_mat(i,1:i-1)) + hh(1,i) * w_comb(1:nq, i)
    call PRECISION_GEMV('N', nq, i-1, &
                        hh(1,i), w_comb(1:nq, 1:i-1), nq, &
                        s_mat(i,1:i-1), 1, &
                        hh(1,i), w_comb(1:nq,i), 1)
  enddo

  ! Accumulate the combined update into q
  !q(1:nq, 1:nb+3) = matmul(w_comb, h_mat) + q(1:nq, 1:nb+3)
  call PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
                      ONE, w_comb, nq, &
                      h_mat, 4, &
                      ONE, q, ldq)
end subroutine
#endif
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "complex_avx-avx2_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "complex_avx-avx2_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef SINGLE_PRECISION
#undef COMPLEXCASE
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_avx-avx2_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_avx-avx2_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
#undef COMPLEXCASE
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "complex_avx512_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "complex_avx512_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef SINGLE_PRECISION
#undef COMPLEXCASE
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "complex_avx512_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "complex_avx512_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
#undef COMPLEXCASE
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SSE_128
#include "../../general/precision_macros.h"
#include "complex_sse_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SSE_128
#include "../../general/precision_macros.h"
#include "complex_sse_1hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef COMPLEXCASE
#undef SINGLE_PRECISION
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_sse_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
......@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_sse_2hv_template.c"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
#undef COMPLEXCASE
This diff is collapsed.
This diff is collapsed.
......@@ -48,8 +48,12 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#include "../../general/precision_macros.h"
#include "real_avx-avx2_2hv_template.c"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
#undef VEC_SET
#undef REALCASE
#undef DOUBLE_PRECISION
......@@ -48,8 +48,12 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#include "../../general/precision_macros.h"
#include "real_avx-avx2_2hv_template.c"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
#undef VEC_SET
#undef REALCASE
#undef SINGLE_PRECISION
......@@ -148,37 +148,6 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_real_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx_avx2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_real_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx_avx2_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -56,7 +56,7 @@ module elpa2_utilities
implicit none
public
integer(kind=c_int), parameter :: number_of_real_kernels = ELPA_2STAGE_NUMBER_OF_REAL_KERNELS - 7
integer(kind=c_int), parameter :: number_of_real_kernels = ELPA_2STAGE_NUMBER_OF_REAL_KERNELS - 11
integer(kind=c_int), parameter :: number_of_complex_kernels = ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS
#ifdef WITH_REAL_GENERIC_KERNEL
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.