From 69792b15454a57bf11f2d4cf4cd5ea31589e32eb Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Tue, 5 Apr 2016 16:17:09 +0200 Subject: [PATCH] Introduction of new SSE kernels with different blocking The SSE kernels with blocking of 2,4,6 (real case) and 1,2 (complex case) are now available by default. Thus the following changes have been made: - introduce new macros in configure.ac and Makefile.am - rename the AVX kernels to AVX_AVX2 (they also support AVX2) - introduce new files with the SSE kernels - introduce new kernel parameters - make the SSE kernels callable The results are identical to those of the previous kernels. --- Makefile.am | 25 +- configure.ac | 55 +- elpa/elpa_kernel_constants.h | 19 +- .../elpa2_kernels_complex_avx-avx2_1hv.cpp | 5 + .../elpa2_kernels_complex_avx-avx2_2hv.cpp | 5 + .../elpa2_kernels_complex_sse_1hv.cpp | 588 ++++++ .../elpa2_kernels_complex_sse_2hv.cpp | 1465 ++++++++++++++ .../elpa2_kernels_real_avx-avx2_2hv.c | 6 + .../elpa2_kernels_real_avx-avx2_4hv.c | 5 + .../elpa2_kernels_real_avx-avx2_6hv.c | 6 + .../elpa2_kernels_real_sse_2hv.c | 849 ++++++++ .../elpa2_kernels_real_sse_4hv.c | 1302 +++++++++++++ .../elpa2_kernels_real_sse_6hv.c | 1729 +++++++++++++++++ src/elpa2_utilities.F90 | 56 +- src/mod_compute_hh_trafo_complex.F90 | 47 + src/mod_compute_hh_trafo_real.F90 | 116 ++ .../elpa_test_programs_print_headers.X90 | 11 +- 17 files changed, 6263 insertions(+), 26 deletions(-) create mode 100644 src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp create mode 100644 src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp create mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c create mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c create mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c diff --git a/Makefile.am b/Makefile.am index 7f27c097..08b9aefd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -79,31 +79,46 @@ if WITH_COMPLEX_SSE_KERNEL endif endif +if WITH_REAL_SSE_BLOCK2_KERNEL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c +endif + if WITH_REAL_AVX_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c endif +if WITH_REAL_SSE_BLOCK4_KERNEL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c +endif + if WITH_REAL_AVX_BLOCK4_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c endif +if WITH_REAL_SSE_BLOCK6_KERNEL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c +endif + if WITH_REAL_AVX_BLOCK6_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c endif +if WITH_COMPLEX_SSE_BLOCK1_KERNEL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp +endif + if WITH_COMPLEX_AVX_BLOCK1_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp endif +if WITH_COMPLEX_SSE_BLOCK2_KERNEL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp +endif + if WITH_COMPLEX_AVX_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp endif -#if WITH_AVX_SANDYBRIDGE -# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \ -# src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp -#endif - # install any .mod files in the include/ dir elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ nobase_elpa_include_HEADERS = $(wildcard modules/*) diff --git a/configure.ac b/configure.ac index 
8733f8e0..d2fb4d50 100644 --- a/configure.ac +++ b/configure.ac @@ -196,12 +196,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul if test "$?" == 0; then can_compile_sse=yes install_real_sse=yes + install_real_sse_block2=yes + install_real_sse_block4=yes + install_real_sse_block6=yes + install_complex_sse=yes + install_complex_sse_block1=yes + install_complex_sse_block2=yes + else can_compile_sse=no install_real_sse=no + install_real_sse_block2=no + install_real_sse_block4=no + install_real_sse_block6=no + install_complex_sse=no + install_complex_sse_block1=no + install_complex_sse_block2=no fi + rm -f ./test.o AC_MSG_RESULT([${can_compile_sse}]) @@ -477,7 +491,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm AC_COMPILE_IFELSE([AC_LANG_SOURCE([ program test_get_environment -nn character(len=256) :: homedir call get_environment_variable("HOME",homedir) end program @@ -570,6 +583,15 @@ dnl real kernels dnl bgq kernel DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) + dnl real-sse-block2 kernel + DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2]) + + dnl real-sse-block4 kernel + DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel-only],[real-sse-block4-kernel],[install_real_sse_block4]) + + dnl real-sse-block6 kernel + DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6]) + dnl real-avx-block2 kernel DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) @@ -600,6 +622,12 @@ dnl complex kernels dnl complex-bqq kernel DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) + dnl complex-sse-block1 kernel + DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1]) + + dnl complex-sse-block2 kernel + DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2]) + dnl complex-avx-block1 kernel DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) @@ -641,6 +669,21 @@ if test x"${install_complex_sse}" = x"yes" ; then AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel]) fi +AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"]) +if test x"${install_real_sse_block2}" = x"yes" ; then + AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel]) +fi + +AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"]) +if test x"${install_real_sse_block4}" = x"yes" ; then + AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel]) +fi + +AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"]) +if test x"${install_real_sse_block6}" = x"yes" ; then + AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel]) +fi + AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"]) if test x"${install_real_avx_block2}" = x"yes" ; then AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel]) @@ -656,6 +699,16 @@ if test x"${install_real_avx_block6}" = x"yes" ; then AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 
kernel]) fi +AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"]) +if test x"${install_complex_sse_block1}" = x"yes" ; then + AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel]) +fi + +AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"]) +if test x"${install_complex_sse_block2}" = x"yes" ; then + AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel]) +fi + AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"]) if test x"${install_complex_avx_block1}" = x"yes" ; then AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel]) diff --git a/elpa/elpa_kernel_constants.h b/elpa/elpa_kernel_constants.h index 45ae1640..f723d32c 100644 --- a/elpa/elpa_kernel_constants.h +++ b/elpa/elpa_kernel_constants.h @@ -3,11 +3,14 @@ #define ELPA2_REAL_KERNEL_BGP 3 #define ELPA2_REAL_KERNEL_BGQ 4 #define ELPA2_REAL_KERNEL_SSE 5 -#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 -#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 -#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 +#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6 +#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7 +#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8 +#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9 +#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10 +#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11 -#define ELPA2_NUMBER_OF_REAL_KERNELS 8 +#define ELPA2_NUMBER_OF_REAL_KERNELS 11 #define ELPA2_COMPLEX_KERNEL_GENERIC 1 @@ -15,7 +18,9 @@ #define ELPA2_COMPLEX_KERNEL_BGP 3 #define ELPA2_COMPLEX_KERNEL_BGQ 4 #define ELPA2_COMPLEX_KERNEL_SSE 5 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 +#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6 +#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7 +#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8 +#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9 -#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7 +#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 9 diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp index a97710de..08fae40c 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp @@ -59,12 +59,15 @@ // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- +#include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) +#ifdef HAVE_AVX2 + #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) @@ -77,6 +80,8 @@ #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #endif +#endif + extern "C" { //Forward declaration diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp index eb8d3934..8a725406 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp @@ -59,12 +59,15 @@ // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- +#include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) 
+#ifdef HAVE_AVX2 + #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) @@ -77,6 +80,8 @@ #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #endif +#endif + extern "C" { //Forward declaration diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp new file mode 100644 index 00000000..1b5a731f --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp @@ -0,0 +1,588 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include +#include + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +{ + std::complex x0; + std::complex x1; + std::complex x2; + std::complex x3; + std::complex h0; + std::complex tau0; + int i=0; + + x0 = q[0]; + x1 = q[1]; + x2 = q[2]; + x3 = q[3]; + + for (i = 1; i < nb; i++) + { + h0 = conj(hh[i]); + x0 += (q[(i*ldq)+0] * h0); + x1 += (q[(i*ldq)+1] * h0); + x2 += (q[(i*ldq)+2] * h0); + x3 += (q[(i*ldq)+3] * h0); + } + + tau0 = hh[0]; + + h0 = (-1.0)*tau0; + + x0 *= h0; + x1 *= h0; + x2 *= h0; + x3 *= h0; + + q[0] += x0; + q[1] += x1; + q[2] += x2; + q[3] += x3; + + for (i = 1; i < nb; i++) + { + h0 = hh[i]; + q[(i*ldq)+0] += (x0*h0); + q[(i*ldq)+1] += (x1*h0); + q[(i*ldq)+2] += (x2*h0); + q[(i*ldq)+3] += (x3*h0); + } +} +#endif // if 0 + +void single_hh_trafo_complex_sse_1hv_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + //int ldh = *pldh; + + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq); + } + if (nq-i > 2) + { + hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); + } + else if (nq-i > 0) + { + hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); + } +} + +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2, x3, x4, x5, x6; + __m128d q1, q2, q3, q4, q5, q6; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + x3 = _mm_load_pd(&q_dbl[4]); + x4 = _mm_load_pd(&q_dbl[6]); + x5 = _mm_load_pd(&q_dbl[8]); + x6 = _mm_load_pd(&q_dbl[10]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); + q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, 
_MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + tmp5 = _mm_mul_pd(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#else + x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#endif + tmp6 = _mm_mul_pd(h1_imag, q6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#else + x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + tmp5 = _mm_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); +#else + x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); +#endif + tmp6 = _mm_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); +#else + x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + q5 = _mm_load_pd(&q_dbl[8]); + q6 = _mm_load_pd(&q_dbl[10]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + q5 = _mm_add_pd(q5, x5); + q6 = _mm_add_pd(q6, x6); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + _mm_store_pd(&q_dbl[8], q5); + _mm_store_pd(&q_dbl[10], q6); + + for (i = 1; i < nb; i++) + { + h1_real = 
_mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); + q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + tmp5 = _mm_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#else + q5 = _mm_add_pd(q5, _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#endif + tmp6 = _mm_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#else + q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + _mm_store_pd(&q_dbl[(2*i*ldq)+8], q5); + _mm_store_pd(&q_dbl[(2*i*ldq)+10], q6); + } +} + +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2, x3, x4; + __m128d q1, q2, q3, q4; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + x3 = _mm_load_pd(&q_dbl[4]); + x4 = _mm_load_pd(&q_dbl[6]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, 
_mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, 
x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + } +} + +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2; + __m128d q1, q2; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + } +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp new file mode 100644 index 00000000..8d1c0ad4 --- 
/dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp @@ -0,0 +1,1465 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + +#include +#include + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + std::complex x1; + std::complex x2; + std::complex x3; + std::complex x4; + std::complex y1; + std::complex y2; + std::complex y3; + std::complex y4; + std::complex h1; + std::complex h2; + std::complex tau1; + std::complex tau2; + int i=0; + + x1 = q[ldq+0]; + x2 = q[ldq+1]; + x3 = q[ldq+2]; + x4 = q[ldq+3]; + + h2 = conj(hh[ldh+1]); + + y1 = q[0] + (x1*h2); + y2 = q[1] + (x2*h2); + y3 = q[2] + (x3*h2); + y4 = q[3] + (x4*h2); + + for (i = 2; i < nb; i++) + { + h1 = conj(hh[i-1]); + h2 = conj(hh[ldh+i]); + + x1 += (q[(i*ldq)+0] * h1); + y1 += (q[(i*ldq)+0] * h2); + x2 += (q[(i*ldq)+1] * h1); + y2 += (q[(i*ldq)+1] * h2); + x3 += (q[(i*ldq)+2] * h1); + y3 += (q[(i*ldq)+2] * h2); + x4 += (q[(i*ldq)+3] * h1); + y4 += (q[(i*ldq)+3] * h2); + } + h1 = conj(hh[nb-1]); + + x1 += (q[(nb*ldq)+0] * h1); + x2 += (q[(nb*ldq)+1] * h1); + x3 += (q[(nb*ldq)+2] * h1); + x4 += (q[(nb*ldq)+3] * h1); + + tau1 = hh[0]; + tau2 = hh[ldh]; + + h1 = (-1.0)*tau1; + + x1 *= h1; + x2 *= h1; + x3 *= h1; + x4 *= h1; + + h1 = (-1.0)*tau2; + h2 = (-1.0)*tau2; + h2 *= s; + y1 = y1*h1 +x1*h2; + y2 = y2*h1 +x2*h2; + y3 = y3*h1 +x3*h2; + y4 = y4*h1 +x4*h2; + + q[0] += y1; + q[1] += y2; + q[2] += y3; + q[3] += y4; + + h2 = hh[ldh+1]; + q[ldq+0] += (x1 + (y1*h2)); + q[ldq+1] += (x2 + (y2*h2)); + q[ldq+2] += (x3 + (y3*h2)); + q[ldq+3] += (x4 + (y4*h2)); + + for (i = 2; i < nb; i++) + { + h1 = hh[i-1]; + h2 = hh[ldh+i]; + + q[(i*ldq)+0] += ((x1*h1) + (y1*h2)); + q[(i*ldq)+1] += ((x2*h1) + (y2*h2)); + q[(i*ldq)+2] += ((x3*h1) + (y3*h2)); + q[(i*ldq)+3] += ((x4*h1) + (y4*h2)); + } + + h1 = hh[nb-1]; + q[(nb*ldq)+0] += (x1*h1); + q[(nb*ldq)+1] += (x2*h1); + q[(nb*ldq)+2] += (x3*h1); + q[(nb*ldq)+3] += (x4*h1); +} +#endif + +void double_hh_trafo_complex_sse_2hv_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + std::complex s = conj(hh[(ldh)+1])*1.0; + for (i = 2; i < nb; i++) + { + s += hh[i-1] * conj(hh[(i+ldh)]); + } + +#if 1 + for (i = 0; i < nq; i+=4) + { + hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } +#else + for (i = 0; i < nq-2; i+=3) + { + hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } + if (nq-i > 1) + { + hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } + else if (nq-i > 0) + { + 
hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } +#endif +} + +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2, x3, x4; + __m128d y1, y2, y3, y4; + __m128d q1, q2, q3, q4; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); + x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + y3 = _mm_load_pd(&q_dbl[4]); + y4 = _mm_load_pd(&q_dbl[6]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, 
tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = 
_mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = 
_mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + q3 = _mm_add_pd(q3, y3); + q4 = _mm_add_pd(q4, y4); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); + q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + _mm_store_pd(&q_dbl[(ldq*2)+4], q3); + _mm_store_pd(&q_dbl[(ldq*2)+6], q4); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, 
y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); +} + +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2, x3; + __m128d y1, y2, y3; + __m128d q1, q2, q3; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2, tmp3; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); 
+#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + y3 = _mm_load_pd(&q_dbl[4]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); 
+ q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), 
_mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + q3 = _mm_add_pd(q3, y3); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + _mm_store_pd(&q_dbl[(ldq*2)+4], q3); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + 
q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); +} + +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2; + __m128d y1, y2; + __m128d q1, q2; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + 
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, 
tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 
= _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); +} + +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1; + __m128d y1; + __m128d q1; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, 
_MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + __m128d tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + + q1 = _mm_add_pd(q1, y1); + + _mm_store_pd(&q_dbl[0], q1); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + + q1 = _mm_add_pd(q1, x1); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = 
_mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c index a5119b63..7aa1b847 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c @@ -60,10 +60,14 @@ // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + #include #define __forceinline __attribute__((always_inline)) static +#ifdef HAVE_AVX2 + #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) @@ -74,6 +78,8 @@ #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #endif +#endif + //Forward declaration __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c index 5425b696..97ba19ab 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c @@ -59,11 +59,14 @@ // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- +#include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static +#ifdef HAVE_AVX2 + #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) @@ -78,6 +81,8 @@ #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #endif +#endif + //Forward declaration __forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c index 322e7b5a..67338c47 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c @@ -60,10 +60,14 @@ // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + #include #define __forceinline __attribute__((always_inline)) static +#ifdef HAVE_AVX2 + #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) @@ -78,6 +82,8 @@ #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #endif +#endif + //Forward declaration static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); diff --git 
a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c new file mode 100644 index 00000000..0e37b132 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c @@ -0,0 +1,849 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
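+//
+// Orientation note (not used by the code): for one column j of Q, each
+// SSE kernel below is a vectorized form of the following scalar sketch,
+// derived from the intrinsics themselves; names are illustrative only.
+//
+//   x = q[1*ldq+j];                        // dot products with the two
+//   y = q[0*ldq+j] + q[1*ldq+j]*hh[ldh+1]; // Householder vectors, whose
+//   for (i = 2; i < nb; i++) {             // leading 1 is implicit
+//     x += q[i*ldq+j]*hh[i-1];
+//     y += q[i*ldq+j]*hh[ldh+i];
+//   }
+//   x += q[nb*ldq+j]*hh[nb-1];
+//   x *= -hh[0];                           // tau1
+//   y  = -hh[ldh]*y - hh[ldh]*s*x;         // tau2 and coupling scalar s
+//   q[0*ldq+j]   += y;
+//   q[1*ldq+j]   += x + y*hh[ldh+1];
+//   for (i = 2; i < nb; i++)
+//     q[i*ldq+j] += x*hh[i-1] + y*hh[ldh+i];
+//   q[nb*ldq+j]  += x*hh[nb-1];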
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +//Forward declaration +__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); + +void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar product to compute + // 2 householder vectors simultaneously + double s = hh[(ldh)+1]*1.0; + + #pragma ivdep + for (i = 2; i < nb; i++) + { + s += hh[i-1] * hh[(i+ldh)]; + } + + // Production level kernel calls with padding + for (i = 0; i < nq-8; i+=12) + { + hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } + if (nq == i) + { + return; + } + else + { + if (nq-i > 4) + { + hh_trafo_kernel_8_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } + else if (nq-i > 0) + { + hh_trafo_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } + } +} + +#if 0 +void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar product to compute + // 2 householder vectors simultaneously + double s = hh[(ldh)+1]*1.0; + + #pragma ivdep + for (i = 2; i < nb; i++) + { + s += hh[i-1] * hh[(i+ldh)]; + } + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=24) + { + hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + } +#else + for (i = 0; i < nq; i+=12) + { + hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + } +#endif +} +#endif +/** + * Unrolled kernel that computes + * 12 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ + __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [12 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + __m128d x3 = _mm_load_pd(&q[ldq+4]); + __m128d x4 = _mm_load_pd(&q[ldq+6]); + __m128d x5 = _mm_load_pd(&q[ldq+8]); + __m128d x6 = _mm_load_pd(&q[ldq+10]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + +#ifdef __ELPA_USE_FMA__ + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_macc_pd(x1, h1, q1); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_macc_pd(x2, h1, q2); + __m128d q3 = 
_mm_load_pd(&q[4]); + __m128d y3 = _mm_macc_pd(x3, h1, q3); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = _mm_macc_pd(x4, h1, q4); + __m128d q5 = _mm_load_pd(&q[8]); + __m128d y5 = _mm_macc_pd(x5, h1, q5); + __m128d q6 = _mm_load_pd(&q[10]); + __m128d y6 = _mm_macc_pd(x6, h1, q6); +#else + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + __m128d q3 = _mm_load_pd(&q[4]); + __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + __m128d q5 = _mm_load_pd(&q[8]); + __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); + __m128d q6 = _mm_load_pd(&q[10]); + __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); +#endif + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); + y2 = _mm_macc_pd(q2, h2, y2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_macc_pd(q3, h1, x3); + y3 = _mm_macc_pd(q3, h2, y3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_macc_pd(q4, h1, x4); + y4 = _mm_macc_pd(q4, h2, y4); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + x5 = _mm_macc_pd(q5, h1, x5); + y5 = _mm_macc_pd(q5, h2, y5); + q6 = _mm_load_pd(&q[(i*ldq)+10]); + x6 = _mm_macc_pd(q6, h1, x6); + y6 = _mm_macc_pd(q6, h2, y6); +#else + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); + y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2)); + q6 = _mm_load_pd(&q[(i*ldq)+10]); + x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); + y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_macc_pd(q3, h1, x3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_macc_pd(q4, h1, x4); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + x5 = _mm_macc_pd(q5, h1, x5); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + x6 = _mm_macc_pd(q6, h1, x6); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [12 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, 
h1); + x3 = _mm_mul_pd(x3, h1); + x4 = _mm_mul_pd(x4, h1); + x5 = _mm_mul_pd(x5, h1); + x6 = _mm_mul_pd(x6, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); + y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); + y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); + y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); + y5 = _mm_macc_pd(y5, h1, _mm_mul_pd(x5,h2)); + y6 = _mm_macc_pd(y6, h1, _mm_mul_pd(x6,h2)); +#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); + y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); + y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); + y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); +#endif + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + q3 = _mm_load_pd(&q[4]); + q3 = _mm_add_pd(q3, y3); + _mm_store_pd(&q[4],q3); + q4 = _mm_load_pd(&q[6]); + q4 = _mm_add_pd(q4, y4); + _mm_store_pd(&q[6],q4); + q5 = _mm_load_pd(&q[8]); + q5 = _mm_add_pd(q5, y5); + _mm_store_pd(&q[8],q5); + q6 = _mm_load_pd(&q[10]); + q6 = _mm_add_pd(q6, y6); + _mm_store_pd(&q[10],q6); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); + _mm_store_pd(&q[ldq+2],q2); + q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); + _mm_store_pd(&q[ldq+6],q4); + q5 = _mm_load_pd(&q[ldq+8]); + q5 = _mm_add_pd(q5, _mm_macc_pd(y5, h2, x5)); + _mm_store_pd(&q[ldq+8],q5); + q6 = _mm_load_pd(&q[ldq+10]); + q6 = _mm_add_pd(q6, _mm_macc_pd(y6, h2, x6)); + _mm_store_pd(&q[ldq+10],q6); +#else + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); + q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[ldq+6],q4); + q5 = _mm_load_pd(&q[ldq+8]); + q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2))); + _mm_store_pd(&q[ldq+8],q5); + q6 = _mm_load_pd(&q[ldq+10]); + q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); + _mm_store_pd(&q[ldq+10],q6); +#endif + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + q5 = _mm_add_pd(q5, _mm_macc_pd(x5, h1, _mm_mul_pd(y5, h2))); + _mm_store_pd(&q[(i*ldq)+8],q5); + q6 = 
_mm_load_pd(&q[(i*ldq)+10]); + q6 = _mm_add_pd(q6, _mm_macc_pd(x6, h1, _mm_mul_pd(y6, h2))); + _mm_store_pd(&q[(i*ldq)+10],q6); +#else + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2))); + _mm_store_pd(&q[(i*ldq)+8],q5); + q6 = _mm_load_pd(&q[(i*ldq)+10]); + q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); + _mm_store_pd(&q[(i*ldq)+10],q6); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_macc_pd(x1, h1, q1); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_macc_pd(x2, h1, q2); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_macc_pd(x3, h1, q3); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_macc_pd(x4, h1, q4); + _mm_store_pd(&q[(nb*ldq)+6],q4); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + q5 = _mm_macc_pd(x5, h1, q5); + _mm_store_pd(&q[(nb*ldq)+8],q5); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + q6 = _mm_macc_pd(x6, h1, q6); + _mm_store_pd(&q[(nb*ldq)+10],q6); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + _mm_store_pd(&q[(nb*ldq)+6],q4); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); + _mm_store_pd(&q[(nb*ldq)+8],q5); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); + _mm_store_pd(&q[(nb*ldq)+10],q6); +#endif +} + +/** + * Unrolled kernel that computes + * 8 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ +__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + __m128d x3 = _mm_load_pd(&q[ldq+4]); + __m128d x4 = _mm_load_pd(&q[ldq+6]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + +#ifdef __ELPA_USE_FMA__ + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_macc_pd(x1, h1, q1); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_macc_pd(x2, h1, q2); + __m128d q3 = _mm_load_pd(&q[4]); + __m128d y3 = _mm_macc_pd(x3, h1, q3); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = 
_mm_macc_pd(x4, h1, q4); +#else + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + __m128d q3 = _mm_load_pd(&q[4]); + __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); +#endif + + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); + y2 = _mm_macc_pd(q2, h2, y2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_macc_pd(q3, h1, x3); + y3 = _mm_macc_pd(q3, h2, y3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_macc_pd(q4, h1, x4); + y4 = _mm_macc_pd(q4, h2, y4); +#else + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_macc_pd(q3, h1, x3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_macc_pd(q4, h1, x4); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [8 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + x3 = _mm_mul_pd(x3, h1); + x4 = _mm_mul_pd(x4, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); + y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); + y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); + y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); +#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); + y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); +#endif + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + q3 = _mm_load_pd(&q[4]); + q3 = _mm_add_pd(q3, y3); + _mm_store_pd(&q[4],q3); + q4 = _mm_load_pd(&q[6]); + q4 = _mm_add_pd(q4, y4); + _mm_store_pd(&q[6],q4); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); + _mm_store_pd(&q[ldq+2],q2); + 
q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); + _mm_store_pd(&q[ldq+6],q4); +#else + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); + q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[ldq+6],q4); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); +#else + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_macc_pd(x1, h1, q1); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_macc_pd(x2, h1, q2); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_macc_pd(x3, h1, q3); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_macc_pd(x4, h1, q4); + _mm_store_pd(&q[(nb*ldq)+6],q4); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + _mm_store_pd(&q[(nb*ldq)+6],q4); +#endif +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d 
x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + +#ifdef __ELPA_USE_FMA__ + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_macc_pd(x1, h1, q1); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_macc_pd(x2, h1, q2); +#else + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); +#endif + + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); + y2 = _mm_macc_pd(q2, h2, y2); +#else + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_macc_pd(q1, h1, x1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_macc_pd(q2, h1, x2); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [12 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); + y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); +#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); +#endif + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); + _mm_store_pd(&q[ldq+2],q2); +#else + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); +#else + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_macc_pd(x1, 
h1, q1); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_macc_pd(x2, h1, q2); + _mm_store_pd(&q[(nb*ldq)+2],q2); +#else + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); +#endif +} diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c new file mode 100644 index 00000000..109a0004 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c @@ -0,0 +1,1302 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
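+//
+// Orientation note (not used by the code): the driver below needs the
+// pairwise coupling scalars s_j_k of the four Householder vectors
+// (vector k is stored in column (k-1)*ldh of hh, its leading 1 being
+// implicit).  Reading the seed values plus the "loop counter = 2/3"
+// statements and the #pragma ivdep loop together, each scalar is the
+// shifted dot product
+//
+//   s_j_k = hh[(k-1)*ldh + (k-j)]
+//         + sum_{i=k-j+1..nb-1} hh[(j-1)*ldh + i-(k-j)] * hh[(k-1)*ldh + i]
+//
+// for 1 <= j < k <= 4.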
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +//Forward declaration +__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); + +void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + +// printf("s_1_2: %f\n", s_1_2); +// printf("s_1_3: %f\n", s_1_3); +// printf("s_2_3: %f\n", s_2_3); +// printf("s_1_4: %f\n", s_1_4); +// printf("s_2_4: %f\n", s_2_4); +// printf("s_3_4: %f\n", s_3_4); + + // Production level kernel calls with padding + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + if (nq == i) + { + return; + } + else + { + if (nq-i > 2) + { + hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + else + { + hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + } +} + +#if 0 +void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first 
and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=12) + { + hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#else + for (i = 0; i < nq; i+=6) + { + hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#endif +} +#endif +/** + * Unrolled kernel that computes + * 6 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [6 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); + w1 = _mm_macc_pd(a2_1, h_4_2, w1); + w1 = _mm_macc_pd(a1_1, h_4_1, w1); + register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); + z1 = _mm_macc_pd(a1_1, h_3_1, z1); + register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); + register __m128d x1 = a1_1; +#else + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + register __m128d x1 = a1_1; +#endif + + __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a3_2 = _mm_load_pd(&q[ldq+2]); + __m128d a4_2 = _mm_load_pd(&q[0+2]); + +#ifdef __ELPA_USE_FMA__ + register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); + w2 = _mm_macc_pd(a2_2, h_4_2, w2); + w2 = _mm_macc_pd(a1_2, h_4_1, w2); + register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); + z2 = _mm_macc_pd(a1_2, h_3_1, z2); + register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); + register __m128d x2 = a1_2; +#else + register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = 
_mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); + register __m128d x2 = a1_2; +#endif + + __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); + __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); + __m128d a3_3 = _mm_load_pd(&q[ldq+4]); + __m128d a4_3 = _mm_load_pd(&q[0+4]); + +#ifdef __ELPA_USE_FMA__ + register __m128d w3 = _mm_macc_pd(a3_3, h_4_3, a4_3); + w3 = _mm_macc_pd(a2_3, h_4_2, w3); + w3 = _mm_macc_pd(a1_3, h_4_1, w3); + register __m128d z3 = _mm_macc_pd(a2_3, h_3_2, a3_3); + z3 = _mm_macc_pd(a1_3, h_3_1, z3); + register __m128d y3 = _mm_macc_pd(a1_3, h_2_1, a2_3); + register __m128d x3 = a1_3; +#else + register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); + w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); + w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); + register __m128d z3 = _mm_add_pd(a3_3, _mm_mul_pd(a2_3, h_3_2)); + z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); + register __m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); + register __m128d x3 = a1_3; +#endif + + __m128d q1; + __m128d q2; + __m128d q3; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); + x3 = _mm_macc_pd(q3, h1, x3); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); +#endif + + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); + y3 = _mm_macc_pd(q3, h2, y3); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); +#endif + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); + z3 = _mm_macc_pd(q3, h3, z3); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); +#endif + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); + w2 = _mm_macc_pd(q2, h4, w2); + w3 = _mm_macc_pd(q3, h4, w3); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); + x3 = _mm_macc_pd(q3, h1, x3); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); +#endif + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); + y3 = _mm_macc_pd(q3, h2, y3); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); +#endif + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); + z3 = _mm_macc_pd(q3, h3, z3); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-2]); + + q1 = 
_mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); + x3 = _mm_macc_pd(q3, h1, x3); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); +#endif + + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); + y3 = _mm_macc_pd(q3, h2, y3); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); + x3 = _mm_macc_pd(q3, h1, x3); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [6 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + x3 = _mm_mul_pd(x3, h1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); + y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); + y3 = _mm_msub_pd(y3, h1, _mm_mul_pd(x3,h2)); +#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); +#endif + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); + z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); + z3 = _mm_msub_pd(z3, h1, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2))); +#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); + z3 = _mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); +#endif + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); + w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); + w3 = _mm_msub_pd(w3, h1, _mm_macc_pd(z3, h4, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2)))); +#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); + w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); +#endif + + q1 = 
_mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q3 = _mm_load_pd(&q[4]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); + q3 = _mm_sub_pd(q3, w3); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + _mm_store_pd(&q[4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[ldq+2]); + q3 = _mm_load_pd(&q[ldq+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); + q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); + q3 = _mm_sub_pd(q3, _mm_macc_pd(w3, h4, z3)); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); + q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); + q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); +#endif + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[ldq+2],q2); + _mm_store_pd(&q[ldq+4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = _mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); + q3 = _mm_load_pd(&q[(ldq*2)+4]); + q1 = _mm_sub_pd(q1, y1); + q2 = _mm_sub_pd(q2, y2); + q3 = _mm_sub_pd(q3, y3); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); + q3 = _mm_nmacc_pd(w3, h4, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); + q3 = _mm_nmacc_pd(z3, h3, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); +#endif + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + _mm_store_pd(&q[(ldq*2)+4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); + q3 = _mm_load_pd(&q[(ldq*3)+4]); + q1 = _mm_sub_pd(q1, x1); + q2 = _mm_sub_pd(q2, x2); + q3 = _mm_sub_pd(q3, x3); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); + q3 = _mm_nmacc_pd(w3, h4, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); + q3 = _mm_nmacc_pd(y3, h2, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); + q3 = _mm_nmacc_pd(z3, h3, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); +#endif + _mm_store_pd(&q[ldq*3], q1); + _mm_store_pd(&q[(ldq*3)+2], q2); + _mm_store_pd(&q[(ldq*3)+4], q3); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); + q3 = _mm_nmacc_pd(x3, h1, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); +#endif + + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); + q3 = _mm_nmacc_pd(y3, h2, q3); +#else + q1 = _mm_sub_pd(q1, 
_mm_mul_pd(y1,h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); +#endif + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); + q3 = _mm_nmacc_pd(z3, h3, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1,h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); +#endif + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); + q3 = _mm_nmacc_pd(w3, h4, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); +#endif + + _mm_store_pd(&q[i*ldq],q1); + _mm_store_pd(&q[(i*ldq)+2],q2); + _mm_store_pd(&q[(i*ldq)+4],q3); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); + q3 = _mm_nmacc_pd(x3, h1, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); + q3 = _mm_nmacc_pd(y3, h2, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); + q3 = _mm_nmacc_pd(z3, h3, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); +#endif + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + _mm_store_pd(&q[(nb*ldq)+4],q3); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); + q3 = _mm_nmacc_pd(x3, h1, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); + q3 = _mm_nmacc_pd(y3, h2, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); +#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + _mm_store_pd(&q[((nb+1)*ldq)+4],q3); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); + q3 = _mm_nmacc_pd(x3, h1, q3); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); +#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); + _mm_store_pd(&q[((nb+2)*ldq)+4],q3); +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_4hv(double* 
q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); + w1 = _mm_macc_pd(a2_1, h_4_2, w1); + w1 = _mm_macc_pd(a1_1, h_4_1, w1); + __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); + z1 = _mm_macc_pd(a1_1, h_3_1, z1); + __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); + __m128d x1 = a1_1; +#else + __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + __m128d x1 = a1_1; +#endif + + __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a3_2 = _mm_load_pd(&q[ldq+2]); + __m128d a4_2 = _mm_load_pd(&q[0+2]); + +#ifdef __ELPA_USE_FMA__ + __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); + w2 = _mm_macc_pd(a2_2, h_4_2, w2); + w2 = _mm_macc_pd(a1_2, h_4_1, w2); + __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); + z2 = _mm_macc_pd(a1_2, h_3_1, z2); + __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); + __m128d x2 = a1_2; +#else + __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); + __m128d x2 = a1_2; +#endif + + __m128d q1; + __m128d q2; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + z1 = _mm_macc_pd(q1, h3, z1); + w1 = _mm_macc_pd(q1, h4, w1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); +#endif + + q2 = _mm_load_pd(&q[(i*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_macc_pd(q2, h1, x2); + y2 = _mm_macc_pd(q2, h2, y2); + z2 = _mm_macc_pd(q2, h3, z2); + w2 = _mm_macc_pd(q2, h4, w2); +#else + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = 
_mm_macc_pd(q2, h1, x2); + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); + + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); + y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); +#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); +#endif + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); + z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); +#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); +#endif + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); + w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); +#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); +#endif + + q1 = _mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[ldq+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); + q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); +#else + q1 
= _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); + q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); +#endif + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[ldq+2],q2); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = _mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); + q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4)))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); + q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); +#endif + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); + q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_macc_pd(y2, h2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4))))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); + q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); +#endif + _mm_store_pd(&q[ldq*3], q1); + _mm_store_pd(&q[(ldq*3)+2], q2); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); +#endif + _mm_store_pd(&q[i*ldq],q1); + + q2 = _mm_load_pd(&q[(i*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_macc_pd(w2, h4, _mm_mul_pd(z2, h3)), _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2)))); +#else + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); +#endif + _mm_store_pd(&q[(i*ldq)+2],q2); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); + q2 = _mm_sub_pd(q2, _mm_macc_pd(x2, h1, _mm_macc_pd(z2, h3, _mm_mul_pd(y2, h2)))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); +#endif + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); + q2 = _mm_sub_pd(q2, _mm_macc_pd(y2, h2, _mm_mul_pd(x2, h1))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); + q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, 
h1))); +#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); +} + +/** + * Unrolled kernel that computes + * 2 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [2 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); + w1 = _mm_macc_pd(a2_1, h_4_2, w1); + w1 = _mm_macc_pd(a1_1, h_4_1, w1); + __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); + z1 = _mm_macc_pd(a1_1, h_3_1, z1); + __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); + __m128d x1 = a1_1; +#else + __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + __m128d x1 = a1_1; +#endif + + __m128d q1; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + z1 = _mm_macc_pd(q1, h3, z1); + w1 = _mm_macc_pd(q1, h4, w1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); + z1 = _mm_macc_pd(q1, h3, z1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + y1 = _mm_macc_pd(q1, h2, y1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = 
_mm_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + ///////////////////////////////////////////////////// + // Rank-1 update of Q [2 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); +#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); +#endif + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); +#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); +#endif + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); +#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); +#endif + + q1 = _mm_load_pd(&q[0]); + q1 = _mm_sub_pd(q1, w1); + _mm_store_pd(&q[0],q1); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); +#endif + _mm_store_pd(&q[ldq],q1); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = _mm_load_pd(&q[ldq*2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); +#endif + _mm_store_pd(&q[ldq*2],q1); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); +#endif + _mm_store_pd(&q[ldq*3], q1); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); +#endif + _mm_store_pd(&q[i*ldq],q1); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); +#else + q1 = _mm_sub_pd(q1, 
_mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); +#endif + _mm_store_pd(&q[nb*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); +#else + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); +#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); +} diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c new file mode 100644 index 00000000..f9cacbc8 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c @@ -0,0 +1,1729 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
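//
// Illustrative reference (a sketch, not called by the kernels below): the long unrolled
// section in hexa_hh_trafo_real_sse_6hv_ accumulates the pairwise overlaps
// s_a_b = v_a . v_b of the six shifted Householder vectors; the implicit leading 1 of
// the earlier vector contributes the initialization term hh[(b-1)*ldh + (b-a)]. The same
// fifteen values, in the same scalarprods[] ordering ((1,2), (1,3), (2,3), (1,4), ...),
// can be written compactly as below. The name hh_overlaps_ref is hypothetical, and the
// summation order differs from the unrolled code, so the last bits may differ.
static void hh_overlaps_ref(const double* hh, int nb, int ldh, int nvec, double* s)
{
	int a, b, i, d, k = 0;
	for (b = 2; b <= nvec; b++)
	{
		for (a = 1; a < b; a++)
		{
			d = b - a;                         // row shift between reflectors a and b
			s[k] = hh[((b-1)*ldh)+d];          // term from the implicit 1 of reflector a
			for (i = d+1; i < nb; i++)
			{
				s[k] += hh[((a-1)*ldh)+i-d] * hh[((b-1)*ldh)+i];
			}
			k++;
		}
	}
}
// For this file, hh_overlaps_ref(hh, nb, ldh, 6, scalarprods) fills the same values that
// the hand-unrolled code computes; with nvec = 4 the same formula gives the six products
// used in the 4hv kernel file.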
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +//Forward declaration +static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); + +void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + 
scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); +// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: %f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding + for (i = 0; i < nq-2; i+=4) + { + hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + } + if (nq == i) + { + return; + } + else + { + hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + } +} + +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = 
hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); +// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: 
%f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=8) + { + hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#else + for (i = 0; i < nq; i+=4) + { + hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#endif +} +#endif + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*5]); + __m128d a2_1 = _mm_load_pd(&q[ldq*4]); + __m128d a3_1 = _mm_load_pd(&q[ldq*3]); + __m128d a4_1 = _mm_load_pd(&q[ldq*2]); + __m128d a5_1 = _mm_load_pd(&q[ldq]); + __m128d a6_1 = _mm_load_pd(&q[0]); + + __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); + __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); + __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); + __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1); + t1 = _mm_macc_pd(a4_1, h_6_4, t1); + t1 = _mm_macc_pd(a3_1, h_6_3, t1); + t1 = _mm_macc_pd(a2_1, h_6_2, t1); + t1 = _mm_macc_pd(a1_1, h_6_1, t1); +#else + register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); +#endif + __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); + __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); + __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1); + v1 = _mm_macc_pd(a3_1, h_5_3, v1); + v1 = _mm_macc_pd(a2_1, h_5_2, v1); + v1 = _mm_macc_pd(a1_1, h_5_1, v1); +#else + register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); +#endif + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); + w1 = _mm_macc_pd(a2_1, h_4_2, w1); + w1 = _mm_macc_pd(a1_1, h_4_1, w1); +#else + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); +#endif + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); + z1 = _mm_macc_pd(a1_1, h_3_1, z1); + register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); +#else + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, 
_mm_mul_pd(a1_1, h_2_1)); +#endif + register __m128d x1 = a1_1; + + __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]); + __m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a5_2 = _mm_load_pd(&q[(ldq)+2]); + __m128d a6_2 = _mm_load_pd(&q[2]); + +#ifdef __ELPA_USE_FMA__ + register __m128d t2 = _mm_macc_pd(a5_2, h_6_5, a6_2); + t2 = _mm_macc_pd(a4_2, h_6_4, t2); + t2 = _mm_macc_pd(a3_2, h_6_3, t2); + t2 = _mm_macc_pd(a2_2, h_6_2, t2); + t2 = _mm_macc_pd(a1_2, h_6_1, t2); + register __m128d v2 = _mm_macc_pd(a4_2, h_5_4, a5_2); + v2 = _mm_macc_pd(a3_2, h_5_3, v2); + v2 = _mm_macc_pd(a2_2, h_5_2, v2); + v2 = _mm_macc_pd(a1_2, h_5_1, v2); + register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); + w2 = _mm_macc_pd(a2_2, h_4_2, w2); + w2 = _mm_macc_pd(a1_2, h_4_1, w2); + register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); + z2 = _mm_macc_pd(a1_2, h_3_1, z2); + register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); +#else + register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1)); + register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1)); + register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); +#endif + register __m128d x2 = a1_2; + + __m128d q1; + __m128d q2; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + __m128d h5; + __m128d h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-5]); + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); + w2 = _mm_macc_pd(q2, h4, w2); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_macc_pd(q1, h5, v1); + v2 = _mm_macc_pd(q2, h5, v2); +#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + t1 = _mm_macc_pd(q1, h6, t1); + t2 = _mm_macc_pd(q2, h6, t2); +#else + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); + t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); +#ifdef 
__ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); + w2 = _mm_macc_pd(q2, h4, w2); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_macc_pd(q1, h5, v1); + v2 = _mm_macc_pd(q2, h5, v2); +#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); + w2 = _mm_macc_pd(q2, h4, w2); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); + z2 = _mm_macc_pd(q2, h3, z2); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = _mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); + y2 = _mm_macc_pd(q2, h2, y2); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); + q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); + x2 = 
_mm_macc_pd(q2, h1, x2); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + x1 = _mm_mul_pd(x1, tau1); + x2 = _mm_mul_pd(x2, tau1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); + h2 = _mm_mul_pd(tau2, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); + y2 = _mm_msub_pd(y2, tau2, _mm_mul_pd(x2,h2)); +#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); +#endif + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); + __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); + h2 = _mm_mul_pd(tau3, vs_1_3); + h3 = _mm_mul_pd(tau3, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); + z2 = _mm_msub_pd(z2, tau3, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); +#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); +#endif + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); + __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); + h2 = _mm_mul_pd(tau4, vs_1_4); + h3 = _mm_mul_pd(tau4, vs_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); + h4 = _mm_mul_pd(tau4, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); + w2 = _mm_msub_pd(w2, tau4, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); +#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); +#endif + + __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); + __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); + __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); + h2 = _mm_mul_pd(tau5, vs_1_5); + h3 = _mm_mul_pd(tau5, vs_2_5); + __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); + __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); + h4 = _mm_mul_pd(tau5, vs_3_5); + h5 = _mm_mul_pd(tau5, vs_4_5); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); + v2 = _mm_msub_pd(v2, tau5, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); +#else + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); +#endif + + __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); + __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); + __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); + h2 = _mm_mul_pd(tau6, vs_1_6); + h3 = _mm_mul_pd(tau6, vs_2_6); + __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); + __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); + __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); + h4 = _mm_mul_pd(tau6, vs_3_6); + h5 = 
_mm_mul_pd(tau6, vs_4_6); + h6 = _mm_mul_pd(tau6, vs_5_6); +#ifdef __ELPA_USE_FMA__ + t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); + t2 = _mm_msub_pd(t2, tau6, _mm_macc_pd(v2, h6, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))))); +#else + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); + t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))))); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q1 = _mm_sub_pd(q1, t1); + q2 = _mm_sub_pd(q2, t2); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[(ldq+2)]); + q1 = _mm_sub_pd(q1, v1); + q2 = _mm_sub_pd(q2, v2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[(ldq+2)],q2); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + q1 = _mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); + q1 = _mm_sub_pd(q1, z1); + q2 = _mm_sub_pd(q2, z2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[ldq*3],q1); + _mm_store_pd(&q[(ldq*3)+2],q2); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + q1 = _mm_load_pd(&q[ldq*4]); + q2 = _mm_load_pd(&q[(ldq*4)+2]); + q1 = _mm_sub_pd(q1, y1); + q2 = _mm_sub_pd(q2, y2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, 
_mm_mul_pd(w2, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[ldq*4],q1); + _mm_store_pd(&q[(ldq*4)+2],q2); + + h2 = _mm_loaddup_pd(&hh[(ldh)+1]); + q1 = _mm_load_pd(&q[ldq*5]); + q2 = _mm_load_pd(&q[(ldq*5)+2]); + q1 = _mm_sub_pd(q1, x1); + q2 = _mm_sub_pd(q2, x2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[ldq*5],q1); + _mm_store_pd(&q[(ldq*5)+2],q2); + + for (i = 6; i < nb; i++) + { + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + h1 = _mm_loaddup_pd(&hh[i-5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); + q2 = _mm_nmacc_pd(t2, h6, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); +#endif + _mm_store_pd(&q[i*ldq],q1); + _mm_store_pd(&q[(i*ldq)+2],q2); + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); 
+#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); + q2 = _mm_nmacc_pd(v2, h5, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); +#endif + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); + q2 = _mm_nmacc_pd(w2, h4, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); +#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); + q2 = _mm_nmacc_pd(z2, h3, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); +#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); + q2 = _mm_nmacc_pd(x2, h1, q2); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); + q2 = _mm_nmacc_pd(y2, h2, q2); +#else + q1 = 
_mm_sub_pd(q1, _mm_mul_pd(y1, h2));
+	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
+#endif
+	_mm_store_pd(&q[(nb+3)*ldq],q1);
+	_mm_store_pd(&q[((nb+3)*ldq)+2],q2);
+
+	h1 = _mm_loaddup_pd(&hh[nb-1]);
+	q1 = _mm_load_pd(&q[(nb+4)*ldq]);
+	q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]);
+#ifdef __ELPA_USE_FMA__
+	q1 = _mm_nmacc_pd(x1, h1, q1);
+	q2 = _mm_nmacc_pd(x2, h1, q2);
+#else
+	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
+	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
+#endif
+	_mm_store_pd(&q[(nb+4)*ldq],q1);
+	_mm_store_pd(&q[((nb+4)*ldq)+2],q2);
+}
+
+/**
+ * Unrolled kernel that computes
+ * 2 rows of Q simultaneously, a
+ * matrix vector product with six householder
+ * vectors + a rank 1 update is performed
+ */
+__forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
+{
+	/////////////////////////////////////////////////////
+	// Matrix Vector Multiplication, Q [2 x nb+5] * hh
+	// hh contains six householder vectors
+	/////////////////////////////////////////////////////
+	int i;
+
+	__m128d a1_1 = _mm_load_pd(&q[ldq*5]);
+	__m128d a2_1 = _mm_load_pd(&q[ldq*4]);
+	__m128d a3_1 = _mm_load_pd(&q[ldq*3]);
+	__m128d a4_1 = _mm_load_pd(&q[ldq*2]);
+	__m128d a5_1 = _mm_load_pd(&q[ldq]);
+	__m128d a6_1 = _mm_load_pd(&q[0]);
+
+	__m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
+	__m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]);
+	__m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]);
+	__m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]);
+	__m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]);
+#ifdef __ELPA_USE_FMA__
+	register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1);
+	t1 = _mm_macc_pd(a4_1, h_6_4, t1);
+	t1 = _mm_macc_pd(a3_1, h_6_3, t1);
+	t1 = _mm_macc_pd(a2_1, h_6_2, t1);
+	t1 = _mm_macc_pd(a1_1, h_6_1, t1);
+#else
+	register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5));
+	t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4));
+	t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3));
+	t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2));
+	t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1));
+#endif
+	__m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
+	__m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]);
+	__m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]);
+	__m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]);
+#ifdef __ELPA_USE_FMA__
+	register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1);
+	v1 = _mm_macc_pd(a3_1, h_5_3, v1);
+	v1 = _mm_macc_pd(a2_1, h_5_2, v1);
+	v1 = _mm_macc_pd(a1_1, h_5_1, v1);
+#else
+	register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4));
+	v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3));
+	v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2));
+	v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1));
+#endif
+	__m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
+	__m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
+	__m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]);
+#ifdef __ELPA_USE_FMA__
+	register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1);
+	w1 = _mm_macc_pd(a2_1, h_4_2, w1);
+	w1 = _mm_macc_pd(a1_1, h_4_1, w1);
+#else
+	register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3));
+	w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2));
+	w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1));
+#endif
+	__m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]);
+	__m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
+	__m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
+#ifdef __ELPA_USE_FMA__
+	register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1);
+	z1 = _mm_macc_pd(a1_1, h_3_1, z1);
+	register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1);
+#else
+	register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2));
+	z1 =
_mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); +#endif + register __m128d x1 = a1_1; + + __m128d q1; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + __m128d h5; + __m128d h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-5]); + q1 = _mm_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_macc_pd(q1, h5, v1); +#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + t1 = _mm_macc_pd(q1, h6, t1); +#else + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); +#endif + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_macc_pd(q1, h5, v1); +#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_macc_pd(q1, h4, w1); +#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_macc_pd(q1, h3, z1); +#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_macc_pd(q1, h2, y1); +#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); +#endif + + h1 = _mm_loaddup_pd(&hh[nb-1]); + 
q1 = _mm_load_pd(&q[(nb+4)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_macc_pd(q1, h1, x1); +#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); +#endif + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + x1 = _mm_mul_pd(x1, tau1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); + h2 = _mm_mul_pd(tau2, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); +#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); +#endif + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); + __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); + h2 = _mm_mul_pd(tau3, vs_1_3); + h3 = _mm_mul_pd(tau3, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); +#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); +#endif + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); + __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); + h2 = _mm_mul_pd(tau4, vs_1_4); + h3 = _mm_mul_pd(tau4, vs_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); + h4 = _mm_mul_pd(tau4, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); +#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); +#endif + + __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); + __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); + __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); + h2 = _mm_mul_pd(tau5, vs_1_5); + h3 = _mm_mul_pd(tau5, vs_2_5); + __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); + __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); + h4 = _mm_mul_pd(tau5, vs_3_5); + h5 = _mm_mul_pd(tau5, vs_4_5); +#ifdef __ELPA_USE_FMA__ + v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); +#else + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); +#endif + + __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); + __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); + __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); + h2 = _mm_mul_pd(tau6, vs_1_6); + h3 = _mm_mul_pd(tau6, vs_2_6); + __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); + __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); + __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); + h4 = _mm_mul_pd(tau6, vs_3_6); + h5 = _mm_mul_pd(tau6, vs_4_6); + h6 = _mm_mul_pd(tau6, vs_5_6); +#ifdef __ELPA_USE_FMA__ + t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); +#else + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [2 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm_load_pd(&q[0]); + q1 = _mm_sub_pd(q1, t1); + _mm_store_pd(&q[0],q1); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_sub_pd(q1, v1); +#ifdef 
__ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[ldq],q1); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + q1 = _mm_load_pd(&q[ldq*2]); + q1 = _mm_sub_pd(q1, w1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[ldq*2],q1); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq*3]); + q1 = _mm_sub_pd(q1, z1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[ldq*3],q1); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + q1 = _mm_load_pd(&q[ldq*4]); + q1 = _mm_sub_pd(q1, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[ldq*4],q1); + + h2 = _mm_loaddup_pd(&hh[(ldh)+1]); + q1 = _mm_load_pd(&q[ldq*5]); + q1 = _mm_sub_pd(q1, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[ldq*5],q1); + + for (i = 6; i < nb; i++) + { + q1 = _mm_load_pd(&q[i*ldq]); + h1 = _mm_loaddup_pd(&hh[i-5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); +#ifdef 
__ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(t1, h6, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); +#endif + _mm_store_pd(&q[i*ldq],q1); + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(v1, h5, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); +#endif + _mm_store_pd(&q[nb*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(w1, h4, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); +#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(z1, h3, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); +#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(y1, h2, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); +#endif + _mm_store_pd(&q[(nb+3)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_nmacc_pd(x1, h1, q1); +#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); +#endif + _mm_store_pd(&q[(nb+4)*ldq],q1); +} diff --git a/src/elpa2_utilities.F90 b/src/elpa2_utilities.F90 index fedc513f..70c98173 100644 --- a/src/elpa2_utilities.F90 +++ b/src/elpa2_utilities.F90 @@ -71,13 +71,16 @@ module ELPA2_utilities public :: get_actual_real_kernel_name, get_actual_complex_kernel_name public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & - REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, & + REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, & + REAL_ELPA_KERNEL_AVX_BLOCK2, & + REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, & REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6 public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & 
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & - COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, & - COMPLEX_ELPA_KERNEL_AVX_BLOCK2 + COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & + COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & + COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2 public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES @@ -97,6 +100,9 @@ module ELPA2_utilities integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE + integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_REAL_KERNEL_SSE_BLOCK2 + integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4 = ELPA2_REAL_KERNEL_SSE_BLOCK4 + integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6 = ELPA2_REAL_KERNEL_SSE_BLOCK6 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 @@ -112,6 +118,9 @@ module ELPA2_utilities "REAL_ELPA_KERNEL_BGP ", & "REAL_ELPA_KERNEL_BGQ ", & "REAL_ELPA_KERNEL_SSE ", & + "REAL_ELPA_KERNEL_SSE_BLOCK2 ", & + "REAL_ELPA_KERNEL_SSE_BLOCK4 ", & + "REAL_ELPA_KERNEL_SSE_BLOCK6 ", & "REAL_ELPA_KERNEL_AVX_BLOCK2 ", & "REAL_ELPA_KERNEL_AVX_BLOCK4 ", & "REAL_ELPA_KERNEL_AVX_BLOCK6 "/) @@ -122,6 +131,8 @@ module ELPA2_utilities integer, parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE + integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK1 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 + integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 @@ -136,6 +147,8 @@ module ELPA2_utilities "COMPLEX_ELPA_KERNEL_BGP ", & "COMPLEX_ELPA_KERNEL_BGQ ", & "COMPLEX_ELPA_KERNEL_SSE ", & + "COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", & + "COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", & "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", & "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/) @@ -167,20 +180,36 @@ module ELPA2_utilities #else ,0 & #endif -#if WITH_REAL_AVX_BLOCK2_KERNEL +#if WITH_REAL_SSE_BLOCK2_KERNEL ,1 & #else ,0 & #endif -#if WITH_REAL_AVX_BLOCK4_KERNEL +#if WITH_REAL_SSE_BLOCK4_KERNEL ,1 & #else ,0 & #endif -#if WITH_REAL_AVX_BLOCK6_KERNEL +#if WITH_REAL_SSE_BLOCK6_KERNEL ,1 & #else ,0 & + +#endif +#if WITH_REAL_AVX_BLOCK2_KERNEL + ,1 & +#else + ,0 & +#endif +#if WITH_REAL_AVX_BLOCK4_KERNEL + ,1 & +#else + ,0 & +#endif +#if WITH_REAL_AVX_BLOCK6_KERNEL + ,1 & +#else + ,0 & #endif /) @@ -212,16 +241,27 @@ module ELPA2_utilities #else ,0 & #endif -#if WITH_COMPLEX_AVX_BLOCK1_KERNEL +#if WITH_COMPLEX_SSE_BLOCK1_KERNEL ,1 & #else ,0 & #endif -#if WITH_COMPLEX_AVX_BLOCK2_KERNEL +#if WITH_COMPLEX_SSE_BLOCK2_KERNEL ,1 & #else ,0 & #endif + +#if WITH_COMPLEX_AVX_BLOCK1_KERNEL + ,1 & +#else + ,0 & +#endif +#if WITH_COMPLEX_AVX_BLOCK2_KERNEL + ,1 & +#else + ,0 & +#endif /) !****** diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 index f524694e..82bb392d 100644 --- a/src/mod_compute_hh_trafo_complex.F90 +++ b/src/mod_compute_hh_trafo_complex.F90 @@ -90,6 +90,34 @@ module compute_hh_trafo_complex nl = merge(stripe_width, last_stripe_width, 
istripe