...
 
Commits (41)
This source diff could not be displayed because it is too large. You can view the blob instead.
Changelog for ELPA 2018.11.001.rc1 Changelog for upcoming release
- user can define the default kernels
- simple block4 and block6 real kernel
- ELPA versioning number is provided in the C header files
Changelog for ELPA 2018.11.001
- improved autotuning - improved autotuning
- improved performance of generalized problem via Cannon's algorithm - improved performance of generalized problem via Cannon's algorithm
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
## Preamble ## ## Preamble ##
This file provides documentation on how to build the *ELPA* library in **version ELPA-2018.11.001.rc1**. This file provides documentation on how to build the *ELPA* library in **version ELPA-2018.11.001**.
With release of **version ELPA-2017.05.001** the build process has been significantly simplified, With release of **version ELPA-2017.05.001** the build process has been significantly simplified,
which makes it easier to install the *ELPA* library. which makes it easier to install the *ELPA* library.
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
For more details and recent updates please visit the online [issue system] (https://gitlab.mpcdf.mpg.de/elpa/elpa/issues) For more details and recent updates please visit the online [issue system] (https://gitlab.mpcdf.mpg.de/elpa/elpa/issues)
Issues which are not mentioned in a newer release are (considered as) solved. Issues which are not mentioned in a newer release are (considered as) solved.
### ELPA 2018.11.001.rc1 release ### ### ELPA 2018.11.001 release ###
- same issues as in ELPA 2017.11.001 - same issues as in ELPA 2017.11.001
### ELPA 2018.05.001 release ### ### ELPA 2018.05.001 release ###
......
...@@ -78,7 +78,7 @@ https://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html ...@@ -78,7 +78,7 @@ https://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html
"legacy interface", since as announced some deprecated function aliases have been "legacy interface", since as announced some deprecated function aliases have been
removed). For the current interface all changes since 2017.05.001 are removed). For the current interface all changes since 2017.05.001 are
compatible, since only some functions have been added. compatible, since only some functions have been added.
The state of release 2017.11.001.(rc1) defines this interface The state of release 2017.11.001 defines this interface
- 12 - 12
No incompatible API changes w.r.t. the previous version. Some functions have been No incompatible API changes w.r.t. the previous version. Some functions have been
......
...@@ -108,6 +108,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -108,6 +108,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/real_template.F90 \ src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/complex_template.F90 \ src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \ src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \ src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \ src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \ src/elpa2/compute_hh_trafo.F90 \
...@@ -188,6 +189,13 @@ if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL ...@@ -188,6 +189,13 @@ if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_simple.F90 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_simple.F90
endif endif
if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif
#if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
#endif
if WITH_REAL_BGP_KERNEL if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif endif
...@@ -443,6 +451,7 @@ nobase_elpa_include_HEADERS = \ ...@@ -443,6 +451,7 @@ nobase_elpa_include_HEADERS = \
elpa/elpa_legacy.h elpa/elpa_legacy.h
nobase_nodist_elpa_include_HEADERS = \ nobase_nodist_elpa_include_HEADERS = \
elpa/elpa_version.h \
elpa/elpa_constants.h \ elpa/elpa_constants.h \
elpa/elpa_generated.h \ elpa/elpa_generated.h \
elpa/elpa_generated_legacy.h elpa/elpa_generated_legacy.h
...@@ -779,6 +788,7 @@ EXTRA_DIST = \ ...@@ -779,6 +788,7 @@ EXTRA_DIST = \
src/elpa2/kernels/real_sse_6hv_template.c \ src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_template.F90 \ src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \ src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \ src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \ src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \ src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
## Current Release ## ## Current Release ##
The current release is ELPA 2018.11.001.rc1 The current supported API version The current release is ELPA 2018.11.001 The current supported API version
is 20181113. This release supports the earliest API version 20170403. is 20181113. This release supports the earliest API version 20170403.
The old, obsolete legacy API will be deprecated in the future ! The old, obsolete legacy API will be deprecated in the future !
...@@ -110,7 +110,7 @@ the possible configure options. ...@@ -110,7 +110,7 @@ the possible configure options.
## Using *ELPA* ## Using *ELPA*
Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online] Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online]
(http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001.rc1/html/index.html) doxygen (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001/html/index.html) doxygen
documentation, where you find the definition of the interfaces. documentation, where you find the definition of the interfaces.
## Contributing to *ELPA* ## Contributing to *ELPA*
......
This file contains the release notes for the ELPA 2018.11.001.rc1 version This file contains the release notes for the ELPA 2018.11.001 version
What is new? What is new?
------------- -------------
......
...@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst ...@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program which prints all For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program which prints all
the available kernels. the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001.rc1/html/index.html) Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001/html/index.html)
for each *ELPA* release is available. for each *ELPA* release is available.
...@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst ...@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all
the available kernels. the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001.rc1/html/index.html) Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001/html/index.html)
for each *ELPA* release is available. for each *ELPA* release is available.
...@@ -200,7 +200,7 @@ The following table gives a list of all supported parameters which can be used t ...@@ -200,7 +200,7 @@ The following table gives a list of all supported parameters which can be used t
## III) List of computational routines ## ## III) List of computational routines ##
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001.rc1/html/index.html) for details. The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001/html/index.html) for details.
| Name | Purpose | since API version | | Name | Purpose | since API version |
......
...@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts: ...@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts:
Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices. Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices.
Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001.rc1/html/index.html) for details): Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2018.11.001/html/index.html) for details):
- elpa_get_communicators : set the row / column communicators for *ELPA* - elpa_get_communicators : set the row / column communicators for *ELPA*
- elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver - elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver
......
if [ "$(hostname)" == "freya01" ]; then module purge && source /mpcdf/soft/try_new_modules.sh && module load git intel/17.0 gcc/7 impi/2017.3 mkl/2017.3 autoconf automake libtool pkg-config anaconda/3 && unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP; fi if [ "$(hostname)" == "freya01" ]; then module purge && source /mpcdf/soft/obs_modules.sh && module load git intel/18.0.3 impi/2018.3 mkl/2018.4 anaconda/3/5.1 mpi4py/3.0.0 gcc/8 autoconf automake libtool pkg-config && unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP; fi
if [ "$(hostname)" == "buildtest-rzg" ]; then module load impi/5.1.3 intel/16.0 gcc/6.3 mkl/11.3 autotools pkg-config; fi if [ "$(hostname)" == "buildtest-rzg" ]; then module load impi/5.1.3 intel/16.0 gcc/6.3 mkl/11.3 autotools pkg-config; fi
...@@ -14,8 +14,8 @@ if [ "$(hostname)" == "amarek-elpa-gitlab-runner-2" ]; then module load intel/16 ...@@ -14,8 +14,8 @@ if [ "$(hostname)" == "amarek-elpa-gitlab-runner-2" ]; then module load intel/16
if [ "$(hostname)" == "amarek-elpa-gitlab-runner-3" ]; then module load intel/16.0 gcc mkl/11.3 autoconf automake libtool impi/5.1.3; fi if [ "$(hostname)" == "amarek-elpa-gitlab-runner-3" ]; then module load intel/16.0 gcc mkl/11.3 autoconf automake libtool impi/5.1.3; fi
if [ "$(hostname)" == "amarek-elpa-gitlab-runner-4" ]; then module load intel/16.0 gcc mkl/11.3 autoconf automake libtool impi/5.1.3; fi if [ "$(hostname)" == "amarek-elpa-gitlab-runner-4" ]; then module load intel/16.0 gcc mkl/11.3 autoconf automake libtool impi/5.1.3; fi
if [ "$(hostname)" == "dvl01" ]; then module load intel/17.0 gcc/5.4 mkl/2017 impi/2017.2 gcc/5.4 cuda/8.0; fi if [ "$(hostname)" == "dvl01" ]; then module load intel/17.0 gcc/6.4 mkl/2017 impi/2017.4 cuda/9.2; fi
if [ "$(hostname)" == "dvl02" ]; then module load intel/17.0 gcc/5.4 mkl/2017 impi/2017.2 gcc/5.4 cuda/8.0; fi if [ "$(hostname)" == "dvl02" ]; then module load intel/17.0 gcc/6.4 mkl/2017 impi/2017.4 cuda/9.2; fi
if [ "$(hostname)" == "miy01" ]; then module purge && module load gcc/5.4 smpi essl/5.5 cuda pgi/17.9 && export LD_LIBRARY_PATH=/opt/ibm/spectrum_mpi/lib:/opt/ibm/spectrum_mpi/profilesupport/lib:$LD_LIBRARY_PATH && export PATH=/opt/ibm/spectrum_mpi/bin:$PATH && export OMPI_CC=gcc && export OMPI_FC=gfortran; fi if [ "$(hostname)" == "miy01" ]; then module purge && module load gcc/5.4 smpi essl/5.5 cuda pgi/17.9 && export LD_LIBRARY_PATH=/opt/ibm/spectrum_mpi/lib:/opt/ibm/spectrum_mpi/profilesupport/lib:$LD_LIBRARY_PATH && export PATH=/opt/ibm/spectrum_mpi/bin:$PATH && export OMPI_CC=gcc && export OMPI_FC=gfortran; fi
if [ "$(hostname)" == "miy02" ]; then module load gcc/5.4 pgi/17.9 ompi/pgi/17.9/1.10.2 essl/5.5 cuda && export LD_LIBRARY_PATH=/opt/ibm/spectrum_mpi/lib:/opt/ibm/spectrum_mpi/profilesupport/lib:$LD_LIBRARY_PATH && export PATH=/opt/ibm/spectrum_mpi/bin:$PATH; fi if [ "$(hostname)" == "miy02" ]; then module load gcc/5.4 pgi/17.9 ompi/pgi/17.9/1.10.2 essl/5.5 cuda && export LD_LIBRARY_PATH=/opt/ibm/spectrum_mpi/lib:/opt/ibm/spectrum_mpi/profilesupport/lib:$LD_LIBRARY_PATH && export PATH=/opt/ibm/spectrum_mpi/bin:$PATH; fi
......
#!/bin/bash #!/bin/bash
source /etc/profile.d/modules.sh #source /etc/profile.d/modules.sh
if [ -f /etc/profile.d/modules.sh ]; then source /etc/profile.d/modules.sh ; else source /etc/profile.d/mpcdf_modules.sh; fi
set -ex set -ex
source ./ci_test_scripts/.ci-env-vars source ./ci_test_scripts/.ci-env-vars
......
#!/bin/bash #!/bin/bash
source /etc/profile.d/modules.sh
#source /etc/profile.d/modules.sh
if [ -f /etc/profile.d/modules.sh ]; then source /etc/profile.d/modules.sh ; else source /etc/profile.d/mpcdf_modules.sh; fi
set -ex set -ex
source ./ci_test_scripts/.ci-env-vars source ./ci_test_scripts/.ci-env-vars
......
...@@ -336,6 +336,19 @@ print(" # stupid 'make distcheck' leaves behind write-protected files that th ...@@ -336,6 +336,19 @@ print(" # stupid 'make distcheck' leaves behind write-protected files that th
print(' - make distcheck DISTCHECK_CONFIGURE_FLAGS="FC=mpiifort FCFLAGS=\\"-xHost\\" CFLAGS=\\"-march=native\\" SCALAPACK_LDFLAGS=\\"$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP\\" SCALAPACK_FCFLAGS=\\"$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP\\" --with-mpi=yes --disable-sse-assembly --disable-sse --disable-avx --disable-avx2" TASKS=2 TEST_FLAGS="150 50 16" || { chmod u+rwX -R . ; exit 1 ; }') print(' - make distcheck DISTCHECK_CONFIGURE_FLAGS="FC=mpiifort FCFLAGS=\\"-xHost\\" CFLAGS=\\"-march=native\\" SCALAPACK_LDFLAGS=\\"$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP\\" SCALAPACK_FCFLAGS=\\"$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP\\" --with-mpi=yes --disable-sse-assembly --disable-sse --disable-avx --disable-avx2" TASKS=2 TEST_FLAGS="150 50 16" || { chmod u+rwX -R . ; exit 1 ; }')
print("\n\n") print("\n\n")
print("distcheck-no-autotune:")
print(" tags:")
print(" - buildtest")
print(" script:")
print(" - ./configure FC=mpiifort FCFLAGS=\"-xHost\" CFLAGS=\"-march=native\" SCALAPACK_LDFLAGS=\"$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP\" SCALAPACK_FCFLAGS=\"$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP\" --enable-option-checking=fatal --with-mpi=yes --disable-sse-assembly --disable-sse --disable-avx --disable-avx2 --disable-autotuning || { cat config.log; exit 1; }")
print(" # stupid 'make distcheck' leaves behind write-protected files that the stupid gitlab runner cannot remove")
print(' - make distcheck DISTCHECK_CONFIGURE_FLAGS="FC=mpiifort FCFLAGS=\\"-xHost\\" CFLAGS=\\"-march=native\\" SCALAPACK_LDFLAGS=\\"$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP\\" SCALAPACK_FCFLAGS=\\"$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP\\" --with-mpi=yes --disable-sse-assembly --disable-sse --disable-avx --disable-avx2 --disable-autotuning " TASKS=2 TEST_FLAGS="150 50 16" || { chmod u+rwX -R . ; exit 1 ; }')
print("\n\n")
# add python tests # add python tests
python_ci_tests = [ python_ci_tests = [
"# python tests", "# python tests",
......
...@@ -13,6 +13,7 @@ configueArg="" ...@@ -13,6 +13,7 @@ configueArg=""
skipStep=0 skipStep=0
batchCommand="" batchCommand=""
interactiveRun="yes" interactiveRun="yes"
SLURMBATCH="no"
function usage() { function usage() {
cat >&2 <<-EOF cat >&2 <<-EOF
...@@ -58,7 +59,7 @@ function usage() { ...@@ -58,7 +59,7 @@ function usage() {
} }
while getopts "c:t:j:m:n:b:o:s:q:i:h" opt; do while getopts "c:t:j:m:n:b:o:s:q:S:i:h" opt; do
case $opt in case $opt in
j) j)
makeTasks=$OPTARG;; makeTasks=$OPTARG;;
...@@ -80,6 +81,8 @@ while getopts "c:t:j:m:n:b:o:s:q:i:h" opt; do ...@@ -80,6 +81,8 @@ while getopts "c:t:j:m:n:b:o:s:q:i:h" opt; do
batchCommand=$OPTARG;; batchCommand=$OPTARG;;
i) i)
interactiveRun=$OPTARG;; interactiveRun=$OPTARG;;
S)
SLURMBATCH=$OPTARG;;
:) :)
echo "Option -$OPTARG requires an argument" >&2;; echo "Option -$OPTARG requires an argument" >&2;;
h) h)
......
#!/bin/bash #!/bin/bash
source /etc/profile.d/modules.sh #source /etc/profile.d/modules.sh
if [ -f /etc/profile.d/modules.sh ]; then source /etc/profile.d/modules.sh ; else source /etc/profile.d/mpcdf_modules.sh; fi
set -ex set -ex
source ./ci_test_scripts/.ci-env-vars source ./ci_test_scripts/.ci-env-vars
......
...@@ -29,12 +29,21 @@ AM_SILENT_RULES([yes]) ...@@ -29,12 +29,21 @@ AM_SILENT_RULES([yes])
# #
AC_SUBST([ELPA_SO_VERSION], [13:0:0]) AC_SUBST([ELPA_SO_VERSION], [13:0:0])
# AC_DEFINE_SUBST(NAME, VALUE, DESCRIPTION)
# -----------------------------------------
# Convenience wrapper that registers NAME twice with the same VALUE:
#   - through AC_DEFINE, as a C preprocessor symbol with DESCRIPTION as its
#     template comment
#   - through AC_SUBST, as an output variable, so that @NAME@ placeholders in
#     generated files are replaced by VALUE
# The value is handed to AC_SUBST wrapped in single quotes.
AC_DEFUN([AC_DEFINE_SUBST], [
AC_DEFINE([$1], [$2], [$3])
AC_SUBST([$1], ['$2'])
])
# API Version # API Version
AC_DEFINE([EARLIEST_API_VERSION], [20170403], [Earliest supported ELPA API version]) AC_DEFINE([EARLIEST_API_VERSION], [20170403], [Earliest supported ELPA API version])
AC_DEFINE([CURRENT_API_VERSION], [20181113], [Current ELPA API version])
AC_DEFINE_SUBST(CURRENT_API_VERSION, 20181113, "Current ELPA API version")
# Autotune Version # Autotune Version
AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, which supports autotuning]) AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, which supports autotuning])
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20181113], [Current ELPA autotune version]) AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20181113], [Current ELPA autotune version])
AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20181113, "Current ELPA autotune version")
AX_CHECK_GNU_MAKE() AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then if test x$_cv_gnu_make_command = x ; then
...@@ -540,6 +549,7 @@ m4_pattern_forbid([elpa_m4]) ...@@ -540,6 +549,7 @@ m4_pattern_forbid([elpa_m4])
m4_define(elpa_m4_generic_kernels, [ m4_define(elpa_m4_generic_kernels, [
real_generic real_generic
real_generic_simple real_generic_simple
real_generic_simple_block4
complex_generic complex_generic
complex_generic_simple complex_generic_simple
]) ])
...@@ -748,6 +758,30 @@ m4_foreach_w([elpa_m4_type],elpa_m4_kernel_types,[ ...@@ -748,6 +758,30 @@ m4_foreach_w([elpa_m4_type],elpa_m4_kernel_types,[
dnl the list of kernels is now assembled dnl the list of kernels is now assembled
dnl choosing a default kernel dnl choosing a default kernel
m4_foreach_w([elpa_m4_kind],[real complex],[
AC_ARG_WITH([default-]elpa_m4_kind[-kernel], m4_expand([AS_HELP_STRING([--with-default-]elpa_m4_kind[-kernel]=KERNEL,
[set a specific ]elpa_m4_kind[ kernel as default kernel. Available kernels are:]
m4_foreach_w([elpa_m4_kernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[m4_bpatsubst(elpa_m4_kernel,elpa_m4_kind[]_,[]) ]))]),
[default_]elpa_m4_kind[_kernel="]elpa_m4_kind[_$withval"],[default_]elpa_m4_kind[_kernel=""])
#if test -n "$default_[]elpa_m4_kind[]_kernel" ; then
# found="no"
# m4_foreach_w([elpa_m4_otherkernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[
# if test "$default_]elpa_m4_kind[_kernel" = "]elpa_m4_otherkernel[" ; then
# use_[]elpa_m4_otherkernel[]=yes
# found="yes"
# else
# use_[]elpa_m4_otherkernel[]=no
# fi
# ])
# if test x"$found" = x"no" ; then
# AC_MSG_ERROR([Invalid kernel "$default_]elpa_m4_kind[_kernel" specified for --with-default-]elpa_m4_kind[-kernel])
# fi
# AC_DEFINE([WITH_DEFAULT_]m4_toupper(elpa_m4_kind)[_KERNEL],[1],[use specific ]elpa_m4_kind[ default kernel (set at compile time)])
#fi
])
m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel], m4_foreach_w([elpa_m4_cand_kernel],
...@@ -1257,6 +1291,7 @@ AC_CONFIG_FILES([ ...@@ -1257,6 +1291,7 @@ AC_CONFIG_FILES([
Doxyfile Doxyfile
${PKG_CONFIG_FILE}:elpa.pc.in ${PKG_CONFIG_FILE}:elpa.pc.in
elpa/elpa_constants.h elpa/elpa_constants.h
elpa/elpa_version.h
]) ])
m4_include([m4/ax_fc_check_define.m4]) m4_include([m4/ax_fc_check_define.m4])
...@@ -1404,12 +1439,12 @@ echo "* off). With the 2019.11.001 release it will be abolished! *" ...@@ -1404,12 +1439,12 @@ echo "* off). With the 2019.11.001 release it will be abolished! *"
echo "***********************************************************************" echo "***********************************************************************"
echo " " echo " "
echo " " echo " "
echo "***********************************************************************" #echo "***********************************************************************"
echo "* This is a the first release candidate of ELPA 2018.11.001.rc1 *" #echo "* This is a the first release candidate of ELPA 2018.11.001.rc1 *"
echo "* There might be still some changes until the final release of *" #echo "* There might be still some changes until the final release of *"
echo "* ELPA 2018.11.001 *" #echo "* ELPA 2018.11.001 *"
echo "***********************************************************************" #echo "***********************************************************************"
echo " " #echo " "
if test x"$enable_kcomputer" = x"yes" ; then if test x"$enable_kcomputer" = x"yes" ; then
echo " " echo " "
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
%define with_openmp 0 %define with_openmp 0
Name: elpa Name: elpa
Version: 2018.11.001.rc1 Version: 2018.11.001
Release: 1 Release: 1
Summary: A massively parallel eigenvector solver Summary: A massively parallel eigenvector solver
License: LGPL-3.0 License: LGPL-3.0
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <limits.h> #include <limits.h>
#include <complex.h> #include <complex.h>
#include <elpa/elpa_version.h>
struct elpa_struct; struct elpa_struct;
typedef struct elpa_struct *elpa_t; typedef struct elpa_struct *elpa_t;
......
...@@ -46,7 +46,8 @@ enum ELPA_SOLVERS { ...@@ -46,7 +46,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
#define ELPA_API_VERSION @CURRENT_API_VERSION@
#define ELPA_AUTOTUNE_API_VERSION @CURRENT_AUTOTUNE_VERSION@
...@@ -62,17 +62,17 @@ module mod_check_for_gpu ...@@ -62,17 +62,17 @@ module mod_check_for_gpu
gpuAvailable = .false. gpuAvailable = .false.
if(cublasHandle .ne. -1) then if (cublasHandle .ne. -1) then
gpuAvailable = .true. gpuAvailable = .true.
numberOfDevices = -1 numberOfDevices = -1
if(myid == 0) then if (myid == 0) then
print *, "Skipping GPU init, should have already been initialized " print *, "Skipping GPU init, should have already been initialized "
endif endif
return return
else else
if(myid == 0) then if (myid == 0) then
print *, "Initializing the GPU devices" print *, "Initializing the GPU devices"
endif endif
endif endif
if (.not.(present(wantDebug))) then if (.not.(present(wantDebug))) then
......
...@@ -77,6 +77,14 @@ ...@@ -77,6 +77,14 @@
use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif #endif
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
! use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
!#endif
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
! use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE)) #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic use real_generic_kernel !, only : double_hh_trafo_generic
#endif #endif
...@@ -140,6 +148,7 @@ ...@@ -140,6 +148,7 @@
#if REALCASE == 1 #if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:) real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
...@@ -172,7 +181,6 @@ ...@@ -172,7 +181,6 @@
#endif #endif
real(kind=c_double) :: ttt ! MPI_WTIME always needs double real(kind=c_double) :: ttt ! MPI_WTIME always needs double
j = -99 j = -99
if (wantDebug) then if (wantDebug) then
...@@ -204,14 +212,14 @@ ...@@ -204,14 +212,14 @@
endif endif
if (wantDebug) call obj%timer%start("compute_hh_trafo_& if (wantDebug) call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_openmp" // & &_openmp" // &
#else #else
&" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -253,14 +261,14 @@ ...@@ -253,14 +261,14 @@
nl = min(my_thread*thread_width-noff, l_nev-noff) nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then if (nl<=0) then
if (wantDebug) call obj%timer%stop("compute_hh_trafo_& if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_openmp" // & &_openmp" // &
#else #else
&" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
return return
endif endif
...@@ -1307,7 +1315,278 @@ ...@@ -1307,7 +1315,278 @@
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if REALCASE == 1 #if REALCASE == 1
! sparc64 block4 real kernel ! generic simple block4 real kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL)
! Real generic simple block4 kernel.
! The columns are processed from ncols downwards: first in groups of four
! (quad_hh_trafo), then the remainder in pairs (double_hh_trafo) and finally
! one leftover column (single_hh_trafo).
#ifndef WITH_FIXED_REAL_KERNEL
! Without a fixed real kernel the kernel is chosen by comparing the variable kernel.
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! With a fixed real kernel this code is only compiled when the block6 variant
! of the generic simple kernel is not enabled as well.
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL))
! Main loop: four columns per call; w(:,1) is taken from column j+off of
! bcast_buffer, w(:,4) from column j+off-3.
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
! The OpenMP build indexes a with the additional thread index my_thread.
! The explicit-shape call variants are commented out, so the same call is
! used whether or not USE_ASSUMED_SIZE is defined.
#ifdef WITH_OPENMP
!#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_simple_4hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, &
! stripe_width, nbw)
!#endif
#else
!#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_simple_4hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, &
! stripe_width, nbw)
!#endif
#endif
enddo
! j keeps the value left by the loop above, i.e. the columns not yet
! processed; handle them two at a time.
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! NOTE(review): the explicit-shape calls below pass w(1:nbw,1:6) although only
! w(:,1) and w(:,2) are filled in this loop - confirm the intended extent.
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, &
nl, stripe_width, nbw)
#endif
#else
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
#endif
#endif
enddo
! jj == 1 after the pair loop means that exactly one column is left; it is
! handled by the single-vector cpu routine using column off+1 of bcast_buffer.
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), &
nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
!real generic simple block6 kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)
! Real generic simple block6 kernel.
! The columns are processed from ncols downwards: first in groups of six
! (hexa_hh_trafo), then in groups of four (quad_hh_trafo), then in pairs
! (double_hh_trafo) and finally one leftover column (single_hh_trafo).
! Each loop starts from the value its predecessor left in its loop variable.
#ifndef WITH_FIXED_REAL_KERNEL
! Without a fixed real kernel the kernel is chosen by comparing the variable kernel.
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! generic simple kernel, using 6 Householder vectors per call
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
! NOTE(review): unlike the 4hv and 2hv calls below, the 6hv call passes
! c_loc(a(...)) - confirm that this matches the interface of the 6hv kernel.
#ifdef WITH_OPENMP
!#ifdef USE_ASSUMED_SIZE
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
! call hexa_hh_trafo_&
! &MATH_DATATYPE&
! &_sse_6hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-5:j+off+a_off-1,istripe,my_thread), w(1:nbw,1:6), &
! nbw, nl, stripe_width, nbw)
!#endif
#else /* WITH_OPENMP */
!#ifdef USE_ASSUMED_SIZE
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
!#else
! call hexa_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_simple_6hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-5:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), &
! nbw, nl, stripe_width, nbw)
!#endif
#endif /* WITH_OPENMP */
enddo
! j keeps the value left by the loop above, i.e. the columns not yet
! processed; handle them four at a time.
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
!#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_simple_4hv_&
! &PRECISION&
! & (a(1:stripe_width,jj+off+a_off-3:jj+off+a_off+nbw-1,istripe,my_thread), &
! w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
!#endif
#else /* WITH_OPENMP */
!#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_simple_4hv_&
! &PRECISION&
! & (a(1:stripe_width,jj+off+a_off-3:jj+off+a_off+nbw-1,istripe), &
! w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
!#endif
#endif /* WITH_OPENMP */
enddo
! jj keeps the value left by the loop above; handle the remaining columns
! two at a time. The column window of a has to follow the loop index jjj in
! both the assumed-size and the explicit-shape call.
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jjj+off+a_off-1:jjj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, &
nl, stripe_width, nbw)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jjj+off+a_off-1:jjj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */
enddo
! jjj == 1 after the pair loop means that exactly one column is left; it is
! handled by the single-vector cpu routine using column off+1 of bcast_buffer.
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! sparc64 block 4 real kernel
#if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL) #if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
...@@ -1316,7 +1595,6 @@ ...@@ -1316,7 +1595,6 @@
#endif /* not WITH_FIXED_REAL_KERNEL */ #endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -1377,6 +1655,7 @@ ...@@ -1377,6 +1655,7 @@
#endif /* REALCASE */ #endif /* REALCASE */
#if REALCASE == 1 #if REALCASE == 1
! vsx block4 real kernel ! vsx block4 real kernel
......
...@@ -138,6 +138,13 @@ program print_available_elpa2_kernels ...@@ -138,6 +138,13 @@ program print_available_elpa2_kernels
do i = 0, elpa_option_cardinality(KERNEL_KEY) do i = 0, elpa_option_cardinality(KERNEL_KEY)
kernel = elpa_option_enumerate(KERNEL_KEY, i) kernel = elpa_option_enumerate(KERNEL_KEY, i)
if (elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_COMPLEX_GPU" .or. &
elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_REAL_GPU") then
if (e%can_set("use_gpu",1) == ELPA_OK) then
call e%set("use_gpu",1)
endif
endif
if (e%can_set(KERNEL_KEY, kernel) == ELPA_OK) then if (e%can_set(KERNEL_KEY, kernel) == ELPA_OK) then
print *, " ", elpa_int_value_to_string(KERNEL_KEY, kernel) print *, " ", elpa_int_value_to_string(KERNEL_KEY, kernel)
endif endif
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#include "config-f90.h"
!#ifndef USE_ASSUMED_SIZE
!module real_generic_simple_block4_kernel
!
! private
! public quad_hh_trafo_real_generic_simple_4hv_double
!
!#ifdef WANT_SINGLE_PRECISION_REAL
! public quad_hh_trafo_real_generic_simple_4hv_single
!#endif
!
! contains
!#endif
! Instantiate the block4 kernel template once per supported precision.
! Each instantiation defines the case and precision selector macros, includes
! the precision macro header followed by the template source, and then
! undefines the selectors again so that the next instantiation starts clean.
! NOTE(review): presumably the double precision pass generates
! quad_hh_trafo_real_generic_simple_4hv_double (the name listed in the
! commented-out module export list above) - confirm against the template.

! double precision instantiation (always built)
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block4_template.F90"
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision instantiation, only when single precision real support
! was requested at configure time via WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block4_template.F90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
!#ifndef USE_ASSUMED_SIZE
!end module real_generic_simple_block4_kernel
!#endif
! --------------------------------------------------------------------------------------------------
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: A. Marek, MPCDF
! --------------------------------------------------------------------------------------------------
#endif
#include "config-f90.h"
!#ifndef USE_ASSUMED_SIZE
!module real_generic_simple_block6_kernel
!
! private
! public hexa_hh_trafo_real_generic_simple_6hv_double
!
!#ifdef WANT_SINGLE_PRECISION_REAL
! public hexa_hh_trafo_real_generic_simple_6hv_single
!#endif
!
! contains
!#endif
! Instantiate the block6 kernel template once per supported precision.
! Each instantiation defines the case and precision selector macros, includes
! the precision macro header followed by the template source, and then
! undefines the selectors again so that the next instantiation starts clean.
! NOTE(review): presumably the double precision pass generates
! hexa_hh_trafo_real_generic_simple_6hv_double (the name listed in the
! commented-out module export list above) - confirm against the template.

! double precision instantiation (always built)
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block6_template.F90"
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision instantiation, only when single precision real support
! was requested at configure time via WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block6_template.F90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
!#ifndef USE_ASSUMED_SIZE
!end module real_generic_simple_block6_kernel
!#endif
! --------------------------------------------------------------------------------------------------
...@@ -65,6 +65,8 @@ ...@@ -65,6 +65,8 @@
#define _SSE_MUL _mm_mul_pd #define _SSE_MUL _mm_mul_pd
#define _SSE_XOR _mm_xor_pd #define _SSE_XOR _mm_xor_pd
#define _SSE_STORE _mm_store_pd #define _SSE_STORE _mm_store_pd
#define _SSE_SET _mm_set_pd
#define _SSE_SET1 _mm_set1_pd
#define offset 2 #define offset 2
#endif #endif
...@@ -75,6 +77,8 @@ ...@@ -75,6 +77,8 @@
#define _SSE_MUL _mm_mul_ps #define _SSE_MUL _mm_mul_ps
#define _SSE_XOR _mm_xor_ps #define _SSE_XOR _mm_xor_ps
#define _SSE_STORE _mm_store_ps #define _SSE_STORE _mm_store_ps
#define _SSE_SET _mm_set_ps
#define _SSE_SET1 _mm_set1_ps
#define offset 4 #define offset 4
#endif #endif
...@@ -469,20 +473,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -469,20 +473,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
__SSE_DATATYPE x6 = _SSE_LOAD(&q[ldq+5*offset]); __SSE_DATATYPE x6 = _SSE_LOAD(&q[ldq+5*offset]);
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h1 = _SSE_SET1(hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h1 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]);
#endif
#endif #endif
__SSE_DATATYPE h2; __SSE_DATATYPE h2;
...@@ -502,24 +496,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -502,24 +496,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
for(i = 2; i < nb; i++) for(i = 2; i < nb; i++)
{ {
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[i-1]);
h1 = _mm_set1_pd(hh[i-1]); h2 = _SSE_SET1(hh[ldh+i]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[i-1], hh[i-1]);
h1 = _mm_set_pd(hh[i-1], hh[i-1]); h2 = _SSE_SET(hh[ldh+i], hh[ldh+i]);
h2 = _mm_set_pd(hh[ldh+i], hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[i-1], hh[i-1]);
h2 = _mm_set_ps(hh[ldh+i], hh[ldh+i]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[i*ldq]); q1 = _SSE_LOAD(&q[i*ldq]);
...@@ -542,20 +524,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -542,20 +524,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2)); y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2));
} }
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[nb-1], hh[nb-1]);
h1 = _mm_set_pd(hh[nb-1], hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-1], hh[nb-1]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[nb*ldq]); q1 = _SSE_LOAD(&q[nb*ldq]);
...@@ -574,28 +546,14 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -574,28 +546,14 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
// Rank-2 update of Q [12 x nb+1] // Rank-2 update of Q [12 x nb+1]
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _SSE_SET1(hh[0]);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]); __SSE_DATATYPE tau2 = _SSE_SET1(hh[ldh]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]); __SSE_DATATYPE vs = _SSE_SET1(s);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _SSE_SET(hh[0], hh[0]);
__SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]); __SSE_DATATYPE tau2 = _SSE_SET(hh[ldh], hh[ldh]);
__SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]); __SSE_DATATYPE vs = _SSE_SET(s, s);
__SSE_DATATYPE vs = _mm_set_pd(s, s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set_ps(hh[0], hh[0]);
__SSE_DATATYPE tau2 = _mm_set_ps(hh[ldh], hh[ldh]);
__SSE_DATATYPE vs = _mm_set_ps(s, s);
#endif
#endif #endif
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
...@@ -644,21 +602,11 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -644,21 +602,11 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
_SSE_STORE(&q[5*offset],q6); _SSE_STORE(&q[5*offset],q6);
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h2 = _SSE_SET1(hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h2 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[ldq]); q1 = _SSE_LOAD(&q[ldq]);
...@@ -683,24 +631,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -683,24 +631,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[i-1]);
h1 = _mm_set1_pd(hh[i-1]); h2 = _SSE_SET1(hh[ldh+i]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[i-1], hh[i-1]);
h1 = _mm_set_pd(hh[i-1], hh[i-1]); h2 = _SSE_SET(hh[ldh+i], hh[ldh+i]);
h2 = _mm_set_pd(hh[ldh+i], hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[i-1], hh[i-1]);
h2 = _mm_set_ps(hh[ldh+i], hh[ldh+i]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[i*ldq]); q1 = _SSE_LOAD(&q[i*ldq]);
...@@ -723,20 +659,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -723,20 +659,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
_SSE_STORE(&q[(i*ldq)+5*offset],q6); _SSE_STORE(&q[(i*ldq)+5*offset],q6);
} }
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[nb-1], hh[nb-1]);
h1 = _mm_set_pd(hh[nb-1], hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-1], hh[nb-1]);
#endif
#endif #endif
...@@ -813,20 +739,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -813,20 +739,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
__SSE_DATATYPE x5 = _SSE_LOAD(&q[ldq+4*offset]); __SSE_DATATYPE x5 = _SSE_LOAD(&q[ldq+4*offset]);
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h1 = _SSE_SET1(hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h1 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]);
#endif
#endif #endif
__SSE_DATATYPE h2; __SSE_DATATYPE h2;
...@@ -844,24 +760,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -844,24 +760,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
for(i = 2; i < nb; i++) for(i = 2; i < nb; i++)
{ {
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[i-1]);
h1 = _mm_set1_pd(hh[i-1]); h2 = _SSE_SET1(hh[ldh+i]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[i-1], hh[i-1]);
h1 = _mm_set_pd(hh[i-1], hh[i-1]); h2 = _SSE_SET(hh[ldh+i], hh[ldh+i]);
h2 = _mm_set_pd(hh[ldh+i], hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[i-1], hh[i-1]);
h2 = _mm_set_ps(hh[ldh+i], hh[ldh+i]);
#endif
#endif #endif
...@@ -883,20 +787,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -883,20 +787,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
} }
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET(hh[nb-1], hh[nb-1]);
h1 = _mm_set_pd(hh[nb-1], hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-1], hh[nb-1]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[nb*ldq]); q1 = _SSE_LOAD(&q[nb*ldq]);
...@@ -913,30 +807,14 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -913,30 +807,14 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
// Rank-2 update of Q [12 x nb+1] // Rank-2 update of Q [12 x nb+1]
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _SSE_SET1(hh[0]);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]); __SSE_DATATYPE tau2 = _SSE_SET1(hh[ldh]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]); __SSE_DATATYPE vs = _SSE_SET1(s);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _SSE_SET(hh[0], hh[0]);
__SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]); __SSE_DATATYPE tau2 = _SSE_SET(hh[ldh], hh[ldh]);
__SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]); __SSE_DATATYPE vs = _SSE_SET(s, s);
__SSE_DATATYPE vs = _mm_set_pd(s, s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set_ps(hh[0], hh[0]);
__SSE_DATATYPE tau2 = _mm_set_ps(hh[ldh], hh[ldh]);
__SSE_DATATYPE vs = _mm_set_ps(s, s);
#endif
#endif #endif
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
...@@ -980,20 +858,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -980,20 +858,10 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
_SSE_STORE(&q[4*offset],q5); _SSE_STORE(&q[4*offset],q5);
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h2 = _SSE_SET1(hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif #endif
#ifdef HAVE_SPARC64_SSE #ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL h2 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]);
#endif
#endif #endif
q1 = _SSE_LOAD(&q[ldq]); q1 = _SSE_LOAD(&q[ldq]);
...@@ -1015,24 +883,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -1015,24 +883,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL h1 = _SSE_SET1(hh[i-1]);
h1 = _mm_set1_pd(hh[i-1]); h2