diff --git a/Makefile.am b/Makefile.am index 6df9044be128d79a6198a19317df185995b9ced2..18386e0d21b02b14fb344692fa00227443ac5dfc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -97,43 +97,73 @@ endif endif if WITH_REAL_SSE_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c +endif endif if WITH_REAL_SSE_BLOCK4_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK4_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c +endif endif if WITH_REAL_SSE_BLOCK6_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK6_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c +endif endif if WITH_COMPLEX_SSE_BLOCK1_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp +endif endif if WITH_COMPLEX_AVX_BLOCK1_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp +endif endif if WITH_COMPLEX_SSE_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp +endif endif if WITH_COMPLEX_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += 
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp +endif endif .cu.lo: diff --git a/src/elpa2.F90 b/src/elpa2.F90 index fe3d9a18e7b46319f2314ee5ed108b58c179e209..6dcd95492068c17af77cf12e91c281da06be54fe 100644 --- a/src/elpa2.F90 +++ b/src/elpa2.F90 @@ -304,6 +304,7 @@ contains if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & + (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then else print *,"At the moment single precision only works with the generic kernels" @@ -655,6 +656,7 @@ contains if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & + (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then else print *,"At the moment single precision only works with the generic kernels" diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp similarity index 93% rename from src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp rename to src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp index 08fae40caf2bf8f5d8d67957b8515e231749933c..b8df8ddb30cfe77b471dcd2cba30333c88ca71a6 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp @@ -85,12 +85,12 @@ extern "C" { //Forward declaration -static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); #if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { std::complex x0; std::complex x1; @@ -139,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex* } #endif // if 0 -void single_hh_trafo_complex_avx_avx2_1hv_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) +void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) { int i; int nb = *pnb; @@ -149,19 +149,19 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex* q, std::complex for (i = 0; i < nq-8; i+=12) { - hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); + hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq); 
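For reference, the update each of these 1hv kernels applies (at widths of 12, 8 or 4 columns per call) is the plain single-Householder reflection that the disabled "#if 0" scalar kernel kept in these files also spells out. A minimal scalar sketch, assuming std::complex<double> elements and an illustrative function name:

    #include <complex>

    // Scalar reference for what the 1hv kernels compute, mirroring the disabled
    // hh_trafo_complex_kernel_4_C_1hv reference kernel: four columns of q are
    // overwritten by (I - tau*v*v^H)*q, where v has an implicit leading 1 and
    // tau = hh[0]. Element type and function name are assumptions of this sketch.
    static void hh_trafo_complex_reference_1hv(std::complex<double>* q,
                                               const std::complex<double>* hh,
                                               int nb, int ldq)
    {
        std::complex<double> x[4];
        for (int j = 0; j < 4; j++) x[j] = q[j];                  // row 0: v(0) == 1

        for (int i = 1; i < nb; i++)                              // x = v^H * q
        {
            std::complex<double> h = std::conj(hh[i]);
            for (int j = 0; j < 4; j++) x[j] += q[i*ldq + j] * h;
        }

        std::complex<double> mtau = -hh[0];                       // tau is stored in hh[0]
        for (int j = 0; j < 4; j++) x[j] *= mtau;                 // x = -tau * (v^H * q)

        for (int j = 0; j < 4; j++) q[j] += x[j];                 // rank-1 update q += v * x
        for (int i = 1; i < nb; i++)
            for (int j = 0; j < 4; j++) q[i*ldq + j] += x[j] * hh[i];
    }

The AVX variants compute exactly this, processing two interleaved (re,im) double pairs per 128-bit half of each __m256d register.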
} if (nq-i > 4) { - hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq); + hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq); } else if (nq-i > 0) { - hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); + hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq); } } - static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq) + static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -356,7 +356,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex* q, std::complex } } -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -501,7 +501,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex } } -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f47a683b5983938324980cfdefc6e2b5df0022c4 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp @@ -0,0 +1,598 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". +// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + +#include +#include + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) +#endif + +#endif + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + std::complex x0; + std::complex x1; + std::complex x2; + std::complex x3; + std::complex h0; + std::complex tau0; + int i=0; + + x0 = q[0]; + x1 = q[1]; + x2 = q[2]; + x3 = q[3]; + + for (i = 1; i < nb; i++) + { + h0 = conj(hh[i]); + x0 += (q[(i*ldq)+0] * h0); + x1 += (q[(i*ldq)+1] * h0); + x2 += (q[(i*ldq)+2] * h0); + x3 += (q[(i*ldq)+3] * h0); + } + + tau0 = hh[0]; + + h0 = (-1.0)*tau0; + + x0 *= h0; + x1 *= h0; + x2 *= h0; + x3 *= h0; + + q[0] += x0; + q[1] += x1; + q[2] += x2; + q[3] += x3; + + for (i = 1; i < nb; i++) + { + h0 = hh[i]; + q[(i*ldq)+0] += (x0*h0); + q[(i*ldq)+1] += (x1*h0); + q[(i*ldq)+2] += (x2*h0); + q[(i*ldq)+3] += (x3*h0); + } +} +#endif // if 0 + +void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + //int ldh = *pldh; + + for (i = 0; i < nq-8; i+=12) + { + hh_trafo_complex_kernel_12_AVX_1hv_single(&q[i], hh, nb, ldq); + } + if (nq-i > 4) + { + hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq); + } + else if (nq-i > 0) + { + hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, 
ldq); + } +} + + static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m256d x1, x2, x3, x4, x5, x6; + __m256d q1, q2, q3, q4, q5, q6; + __m256d h1_real, h1_imag; + __m256d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[0]); + x2 = _mm256_load_pd(&q_dbl[4]); + x3 = _mm256_load_pd(&q_dbl[8]); + x4 = _mm256_load_pd(&q_dbl[12]); + x5 = _mm256_load_pd(&q_dbl[16]); + x6 = _mm256_load_pd(&q_dbl[20]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); + q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + tmp5 = _mm256_mul_pd(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); +#else + x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); +#endif + tmp6 = _mm256_mul_pd(h1_imag, q6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, _mm256_shuffle_pd(tmp6, tmp6, 0x5))); +#else + x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = 
_mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#else + x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#endif + tmp5 = _mm256_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)); +#else + x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)); +#endif + tmp6 = _mm256_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)); +#else + x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + q3 = _mm256_load_pd(&q_dbl[8]); + q4 = _mm256_load_pd(&q_dbl[12]); + q5 = _mm256_load_pd(&q_dbl[16]); + q6 = _mm256_load_pd(&q_dbl[20]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + q3 = _mm256_add_pd(q3, x3); + q4 = _mm256_add_pd(q4, x4); + q5 = _mm256_add_pd(q5, x5); + q6 = _mm256_add_pd(q6, x6); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + _mm256_store_pd(&q_dbl[8], q3); + _mm256_store_pd(&q_dbl[12], q4); + _mm256_store_pd(&q_dbl[16], q5); + _mm256_store_pd(&q_dbl[20], q6); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); + q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + tmp5 = _mm256_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); +#else + q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); +#endif + tmp6 = _mm256_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5))); +#else + q6 = 
_mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); + _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); + _mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5); + _mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6); + } +} + +static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m256d x1, x2, x3, x4; + __m256d q1, q2, q3, q4; + __m256d h1_real, h1_imag; + __m256d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[0]); + x2 = _mm256_load_pd(&q_dbl[4]); + x3 = _mm256_load_pd(&q_dbl[8]); + x4 = _mm256_load_pd(&q_dbl[12]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#else + x4 
= _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + q3 = _mm256_load_pd(&q_dbl[8]); + q4 = _mm256_load_pd(&q_dbl[12]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + q3 = _mm256_add_pd(q3, x3); + q4 = _mm256_add_pd(q4, x4); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + _mm256_store_pd(&q_dbl[8], q3); + _mm256_store_pd(&q_dbl[12], q4); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); + _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); + } +} + +static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m256d x1, x2; + __m256d q1, q2; + __m256d h1_real, h1_imag; + __m256d tmp1, tmp2; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[0]); + x2 = _mm256_load_pd(&q_dbl[4]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = 
_mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + + for (i = 1; i < nb; i++) + { + h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + } +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp similarity index 96% rename from src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp rename to src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp index 8a725406d0153231af9b64ac143f1273f26c9c27..4535bd11efbf18725a2a77f129a11df1df059e9e 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp @@ -85,13 +85,13 @@ extern "C" { //Forward declaration -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); #if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex* q, std::complex* 
hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { std::complex x1; std::complex x2; @@ -188,7 +188,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex* } #endif -void double_hh_trafo_complex_avx_avx2_2hv_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -205,29 +205,29 @@ void double_hh_trafo_complex_avx_avx2_2hv_(std::complex* q, std::complex #if 1 for (i = 0; i < nq-4; i+=8) { - hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } if (nq-i > 0) { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #else for (i = 0; i < nq-4; i+=6) { - hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } if (nq-i > 2) { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i > 0) { - hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #endif } -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -660,7 +660,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); } -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -1013,7 +1013,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); } -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -1286,7 +1286,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); } -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; diff --git a/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..eb989450520cf2e3838c23aae35947aca2272e51 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp @@ -0,0 +1,1481 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
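Before dispatching to the vector kernels, the 2hv drivers (the renamed double_hh_trafo_complex_avx_avx2_2hv_double_ and the new _single_ variant in this file) first accumulate a coupling scalar s from the two Householder vectors; it is essentially the conjugated inner product of the second reflector with the first, offset by one row, which is what allows both reflections to be applied in a single pass over q. A minimal scalar sketch of that accumulation, assuming std::complex<double> elements and a hypothetical helper name:

    #include <complex>

    // Sketch of the coupling scalar formed at the top of the 2hv drivers.
    // hh holds both Householder vectors: hh[0..nb-1] is the first, hh[ldh..ldh+nb-1]
    // the second; both carry an implicit leading 1. The loop mirrors the inline code
    // in double_hh_trafo_complex_avx_avx2_2hv_single_; the function name is illustrative.
    static std::complex<double> coupling_scalar_2hv(const std::complex<double>* hh,
                                                    int nb, int ldh)
    {
        std::complex<double> s = std::conj(hh[ldh + 1]);   // term from the implicit 1 of the first vector
        for (int i = 2; i < nb; i++)
        {
            s += hh[i - 1] * std::conj(hh[ldh + i]);
        }
        return s;
    }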
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + +#include +#include + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) +#endif + +#endif + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + std::complex x1; + std::complex x2; + std::complex x3; + std::complex x4; + std::complex y1; + std::complex y2; + std::complex y3; + std::complex y4; + std::complex h1; + std::complex h2; + std::complex tau1; + std::complex tau2; + int i=0; + + x1 = q[ldq+0]; + x2 = q[ldq+1]; + x3 = q[ldq+2]; + x4 = q[ldq+3]; + + h2 = conj(hh[ldh+1]); + + y1 = q[0] + (x1*h2); + y2 = q[1] + (x2*h2); + y3 = q[2] + (x3*h2); + y4 = q[3] + (x4*h2); + + for (i = 2; i < nb; i++) + { + h1 = conj(hh[i-1]); + h2 = conj(hh[ldh+i]); + + x1 += (q[(i*ldq)+0] * h1); + y1 += (q[(i*ldq)+0] * h2); + x2 += (q[(i*ldq)+1] * h1); + y2 += (q[(i*ldq)+1] * h2); + x3 += (q[(i*ldq)+2] * h1); + y3 += (q[(i*ldq)+2] * h2); + x4 += (q[(i*ldq)+3] * h1); + y4 += (q[(i*ldq)+3] * h2); + } + h1 = conj(hh[nb-1]); + + x1 += (q[(nb*ldq)+0] * h1); + x2 += (q[(nb*ldq)+1] * h1); + x3 += (q[(nb*ldq)+2] * h1); + x4 += (q[(nb*ldq)+3] * h1); + + tau1 = hh[0]; + tau2 = hh[ldh]; + + h1 = (-1.0)*tau1; + + x1 *= h1; + x2 *= h1; + x3 *= h1; + x4 *= h1; + + h1 = (-1.0)*tau2; + h2 = (-1.0)*tau2; + h2 *= s; + y1 = y1*h1 +x1*h2; + y2 = y2*h1 +x2*h2; + y3 = y3*h1 +x3*h2; + y4 = y4*h1 +x4*h2; + + q[0] += y1; + q[1] += y2; + q[2] += y3; + q[3] += y4; + + h2 = hh[ldh+1]; + q[ldq+0] += (x1 + (y1*h2)); + q[ldq+1] += (x2 + (y2*h2)); + q[ldq+2] += (x3 + (y3*h2)); + q[ldq+3] += (x4 + (y4*h2)); + + for (i = 2; i < nb; i++) + { + h1 = hh[i-1]; + h2 = hh[ldh+i]; + + q[(i*ldq)+0] += ((x1*h1) + (y1*h2)); + q[(i*ldq)+1] += ((x2*h1) + (y2*h2)); + q[(i*ldq)+2] += ((x3*h1) + (y3*h2)); + q[(i*ldq)+3] += ((x4*h1) + (y4*h2)); + } + + h1 = hh[nb-1]; + q[(nb*ldq)+0] += (x1*h1); + q[(nb*ldq)+1] += (x2*h1); + q[(nb*ldq)+2] += (x3*h1); + q[(nb*ldq)+3] += (x4*h1); +} +#endif + +void double_hh_trafo_complex_avx_avx2_2hv_single_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + std::complex s = conj(hh[(ldh)+1])*1.0; + for (i = 2; i < nb; i++) + { + s += hh[i-1] * 
conj(hh[(i+ldh)]); + } + +#if 1 + for (i = 0; i < nq-4; i+=8) + { + hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + if (nq-i > 0) + { + hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#else + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_complex_kernel_6_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + if (nq-i > 2) + { + hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + else if (nq-i > 0) + { + hh_trafo_complex_kernel_2_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#endif +} + +static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m256d x1, x2, x3, x4; + __m256d y1, y2, y3, y4; + __m256d q1, q2, q3, q4; + __m256d h1_real, h1_imag, h2_real, h2_imag; + __m256d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); + x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); + x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + y1 = _mm256_load_pd(&q_dbl[0]); + y2 = _mm256_load_pd(&q_dbl[4]); + y3 = _mm256_load_pd(&q_dbl[8]); + y4 = _mm256_load_pd(&q_dbl[12]); + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, 
q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 
0x5))); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#else + x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + h2_real = _mm256_xor_pd(h2_real, sign); + h2_imag = _mm256_xor_pd(h2_imag, sign); + + __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); + tmp2 = _mm256_broadcast_pd(&tmp_s_128); + tmp1 = _mm256_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); + h2_real = _mm256_broadcast_sd(&s_dbl[0]); + h2_imag = _mm256_broadcast_sd(&s_dbl[1]); + + tmp1 = _mm256_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = _mm256_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + tmp4 = _mm256_mul_pd(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#else + y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); 
+#endif + tmp3 = _mm256_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + q3 = _mm256_load_pd(&q_dbl[8]); + q4 = _mm256_load_pd(&q_dbl[12]); + + q1 = _mm256_add_pd(q1, y1); + q2 = _mm256_add_pd(q2, y2); + q3 = _mm256_add_pd(q3, y3); + q4 = _mm256_add_pd(q4, y4); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + _mm256_store_pd(&q_dbl[8], q3); + _mm256_store_pd(&q_dbl[12], q4); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); + q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); + q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + q3 = _mm256_add_pd(q3, x3); + q4 = _mm256_add_pd(q4, x4); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __FMA4_ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); + _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); + _mm256_store_pd(&q_dbl[(ldq*2)+12], q4); + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = 
_mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); + _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); + } + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); + q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + tmp4 = _mm256_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#else + q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); + 
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); +} + +static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m256d x1, x2, x3; + __m256d y1, y2, y3; + __m256d q1, q2, q3; + __m256d h1_real, h1_imag, h2_real, h2_imag; + __m256d tmp1, tmp2, tmp3; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); + x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + y1 = _mm256_load_pd(&q_dbl[0]); + y2 = _mm256_load_pd(&q_dbl[4]); + y3 = _mm256_load_pd(&q_dbl[8]); + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 
= _mm256_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + h2_real = _mm256_xor_pd(h2_real, sign); + h2_imag = _mm256_xor_pd(h2_imag, sign); + + __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); + tmp2 = _mm256_broadcast_pd(&tmp_s_128); + tmp1 = _mm256_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); + h2_real = _mm256_broadcast_sd(&s_dbl[0]); + h2_imag = _mm256_broadcast_sd(&s_dbl[1]); + + tmp1 = _mm256_mul_pd(h1_imag, y1); +#ifdef 
__ELPA_USE_FMA__ + y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + tmp3 = _mm256_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#else + y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + q3 = _mm256_load_pd(&q_dbl[8]); + + q1 = _mm256_add_pd(q1, y1); + q2 = _mm256_add_pd(q2, y2); + q3 = _mm256_add_pd(q3, y3); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + _mm256_store_pd(&q_dbl[8], q3); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); + q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + q3 = _mm256_add_pd(q3, x3); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __FMA4_ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); + _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 
0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); + } + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + tmp3 = _mm256_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#else + q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); + _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); +} + +static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m256d x1, x2; + __m256d y1, y2; + __m256d q1, q2; + __m256d h1_real, h1_imag, h2_real, h2_imag; + __m256d tmp1, tmp2; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = 
_mm256_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + y1 = _mm256_load_pd(&q_dbl[0]); + y2 = _mm256_load_pd(&q_dbl[4]); + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, 
sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + h2_real = _mm256_xor_pd(h2_real, sign); + h2_imag = _mm256_xor_pd(h2_imag, sign); + + __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); + tmp2 = _mm256_broadcast_pd(&tmp_s_128); + tmp1 = _mm256_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); + h2_real = _mm256_broadcast_sd(&s_dbl[0]); + h2_imag = _mm256_broadcast_sd(&s_dbl[1]); + + tmp1 = _mm256_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + tmp2 = _mm256_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#else + y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + q2 = _mm256_load_pd(&q_dbl[4]); + + q1 = _mm256_add_pd(q1, y1); + q2 = _mm256_add_pd(q2, y2); + + _mm256_store_pd(&q_dbl[0], q1); + _mm256_store_pd(&q_dbl[4], q2); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); + + q1 = _mm256_add_pd(q1, x1); + q2 = _mm256_add_pd(q2, x2); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __FMA4_ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); + + 
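+ // Write-back loop for the remaining rows i = 2 .. nb-1: each __m256d holds two complex values as (re,im) pairs, and q(i,:) += hh[i-1]*x + hh[ldh+i]*y is evaluated with the mul / shuffle(0x5) / addsub sequence, which for a = (ar,ai), b = (br,bi) produces (ar*br - ai*bi, ar*bi + ai*br), i.e. the complex product a*b; the __ELPA_USE_FMA__ branch fuses the same operation into a single FMADDSUB.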
for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); + } + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + tmp2 = _mm256_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#else + q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); +} + +static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m256d x1; + __m256d y1; + __m256d q1; + __m256d h1_real, h1_imag, h2_real, h2_imag; + __m256d tmp1; + int i=0; + + __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + + x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + y1 = _mm256_load_pd(&q_dbl[0]); + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + + 
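+ // Accumulation step: x += conj(hh[i-1]) * q(i,:) and y += conj(hh[ldh+i]) * q(i,:). Without FMA the conjugate is formed by flipping the sign bit of the broadcast imaginary part (XOR with the 'sign' mask); with __ELPA_USE_FMA__ the FMSUBADD variant yields the conjugated product directly.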
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm256_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + } + + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm256_xor_pd(h1_imag, sign); +#endif + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm256_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[0]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + + h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm256_xor_pd(h1_real, sign); + h1_imag = _mm256_xor_pd(h1_imag, sign); + h2_real = _mm256_xor_pd(h2_real, sign); + h2_imag = _mm256_xor_pd(h2_imag, sign); + + __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); + __m256d tmp2 = _mm256_broadcast_pd(&tmp_s_128); + tmp1 = _mm256_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); + h2_real = _mm256_broadcast_sd(&s_dbl[0]); + h2_imag = _mm256_broadcast_sd(&s_dbl[1]); + + tmp1 = _mm256_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#else + y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); +#endif + + tmp1 = _mm256_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + q1 = _mm256_load_pd(&q_dbl[0]); + + q1 = _mm256_add_pd(q1, y1); + + _mm256_store_pd(&q_dbl[0], q1); + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = 
_mm256_load_pd(&q_dbl[(ldq*2)+0]); + + q1 = _mm256_add_pd(q1, x1); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); + + for (i = 2; i < nb; i++) + { + q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); + + h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm256_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); + } + h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm256_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#else + q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); +#endif + + _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp similarity index 93% rename from src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp rename to src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp index 1b5a731fdeddf34984de42b761bbd2ab17f5dd97..0fbd258cdb8c5519b4424c1fc239b910cb468d46 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp @@ -75,12 +75,12 @@ extern "C" { //Forward declaration -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq); #if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { std::complex x0; std::complex x1; @@ -129,7 +129,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex* } #endif // if 0 -void single_hh_trafo_complex_sse_1hv_(std::complex* q, std::complex* hh, 
int* pnb, int* pnq, int* pldq) +void single_hh_trafo_complex_sse_1hv_double_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) { int i; int nb = *pnb; @@ -139,19 +139,19 @@ void single_hh_trafo_complex_sse_1hv_(std::complex* q, std::complex 2) { - hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); + hh_trafo_complex_kernel_4_SSE_1hv_double(&q[i], hh, nb, ldq); } else if (nq-i > 0) { - hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); + hh_trafo_complex_kernel_2_SSE_1hv_double(&q[i], hh, nb, ldq); } } -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -346,7 +346,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex } } -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -491,7 +491,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex } } -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex* q, std::complex* hh, int nb, int ldq) +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex* q, std::complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d2759253d486b21e8e871c04773b011c2b061ed --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp @@ -0,0 +1,588 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". +// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include +#include + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + std::complex x0; + std::complex x1; + std::complex x2; + std::complex x3; + std::complex h0; + std::complex tau0; + int i=0; + + x0 = q[0]; + x1 = q[1]; + x2 = q[2]; + x3 = q[3]; + + for (i = 1; i < nb; i++) + { + h0 = conj(hh[i]); + x0 += (q[(i*ldq)+0] * h0); + x1 += (q[(i*ldq)+1] * h0); + x2 += (q[(i*ldq)+2] * h0); + x3 += (q[(i*ldq)+3] * h0); + } + + tau0 = hh[0]; + + h0 = (-1.0)*tau0; + + x0 *= h0; + x1 *= h0; + x2 *= h0; + x3 *= h0; + + q[0] += x0; + q[1] += x1; + q[2] += x2; + q[3] += x3; + + for (i = 1; i < nb; i++) + { + h0 = hh[i]; + q[(i*ldq)+0] += (x0*h0); + q[(i*ldq)+1] += (x1*h0); + q[(i*ldq)+2] += (x2*h0); + q[(i*ldq)+3] += (x3*h0); + } +} +#endif // if 0 + +void single_hh_trafo_complex_sse_1hv_single_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + //int ldh = *pldh; + + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_complex_kernel_6_SSE_1hv_single(&q[i], hh, nb, ldq); + } + if (nq-i > 2) + { + hh_trafo_complex_kernel_4_SSE_1hv_single(&q[i], hh, nb, ldq); + } + else if (nq-i > 0) + { + hh_trafo_complex_kernel_2_SSE_1hv_single(&q[i], hh, nb, ldq); + } +} + +static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2, x3, x4, x5, x6; + __m128d q1, q2, q3, q4, q5, q6; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2, tmp3, tmp4, tmp5, 
tmp6; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + x3 = _mm_load_pd(&q_dbl[4]); + x4 = _mm_load_pd(&q_dbl[6]); + x5 = _mm_load_pd(&q_dbl[8]); + x6 = _mm_load_pd(&q_dbl[10]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); + q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + tmp5 = _mm_mul_pd(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#else + x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#endif + tmp6 = _mm_mul_pd(h1_imag, q6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#else + x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( 
_mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + tmp5 = _mm_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); +#else + x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); +#endif + tmp6 = _mm_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); +#else + x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + q5 = _mm_load_pd(&q_dbl[8]); + q6 = _mm_load_pd(&q_dbl[10]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + q5 = _mm_add_pd(q5, x5); + q6 = _mm_add_pd(q6, x6); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + _mm_store_pd(&q_dbl[8], q5); + _mm_store_pd(&q_dbl[10], q6); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); + q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + tmp5 = _mm_mul_pd(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#else + q5 = _mm_add_pd(q5, _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); +#endif + tmp6 = _mm_mul_pd(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#else + q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + _mm_store_pd(&q_dbl[(2*i*ldq)+8], q5); + _mm_store_pd(&q_dbl[(2*i*ldq)+10], q6); + } +} + +static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(std::complex* q, 
std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2, x3, x4; + __m128d q1, q2, q3, q4; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + x3 = _mm_load_pd(&q_dbl[4]); + x4 = _mm_load_pd(&q_dbl[6]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + + for (i = 1; i < nb; i++) + { + h1_real = 
_mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + } +} + +static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(std::complex* q, std::complex* hh, int nb, int ldq) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + + __m128d x1, x2; + __m128d q1, q2; + __m128d h1_real, h1_imag; + __m128d tmp1, tmp2; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[0]); + x2 = _mm_load_pd(&q_dbl[2]); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + 
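+ // At this point x1/x2 hold (-hh[0]) * (q(0,:) + sum_{i>=1} conj(hh[i]) * q(i,:)); the loads and adds below apply this Householder update to the first row block of q, and the loop that follows performs the rank-1 update q(i,:) += hh[i]*x for the remaining rows.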
q2 = _mm_load_pd(&q_dbl[2]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + + for (i = 1; i < nb; i++) + { + h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + } +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp similarity index 96% rename from src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp rename to src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp index 8d1c0ad41dc231c4e5562c4a4b19cd95b7490fac..0494f95637378ce7f1f05669e1a534e1367bb0c8 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp @@ -73,13 +73,13 @@ extern "C" { //Forward declaration -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); -static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); #if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { std::complex x1; std::complex x2; @@ -176,7 +176,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex* } #endif -void double_hh_trafo_complex_sse_2hv_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void double_hh_trafo_complex_sse_2hv_double_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -193,25 +193,25 @@ void double_hh_trafo_complex_sse_2hv_(std::complex* q, std::complex 1) { - 
hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i > 0) { - hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_complex_kernel_1_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #endif } -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -644,7 +644,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); } -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -997,7 +997,7 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); } -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; @@ -1270,7 +1270,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); } -static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8527bcc31579b44e8349521b934fb57fa85efcc --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp @@ -0,0 +1,1465 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". +// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + +#include <complex> +#include <x86intrin.h> + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +extern "C" { + +//Forward declaration +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s); + +#if 0 +static __forceinline void hh_trafo_complex_kernel_4_C_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + std::complex x1; + std::complex x2; + std::complex x3; + std::complex x4; + std::complex y1; + std::complex y2; + std::complex y3; + std::complex y4; + std::complex h1; + std::complex h2; + std::complex tau1; + std::complex tau2; + int i=0; + + x1 = q[ldq+0]; + x2 = q[ldq+1]; + x3 = q[ldq+2]; + x4 = q[ldq+3]; + + h2 = conj(hh[ldh+1]); + + y1 = q[0] + (x1*h2); + y2 = q[1] + (x2*h2); + y3 = q[2] + (x3*h2); + y4 = q[3] + (x4*h2); + + for (i = 2; i < nb; i++) + { + h1 = conj(hh[i-1]); + h2 = conj(hh[ldh+i]); + + x1 += (q[(i*ldq)+0] * h1); + y1 += (q[(i*ldq)+0] * h2); + x2 += (q[(i*ldq)+1] * h1); + y2 += 
(q[(i*ldq)+1] * h2); + x3 += (q[(i*ldq)+2] * h1); + y3 += (q[(i*ldq)+2] * h2); + x4 += (q[(i*ldq)+3] * h1); + y4 += (q[(i*ldq)+3] * h2); + } + h1 = conj(hh[nb-1]); + + x1 += (q[(nb*ldq)+0] * h1); + x2 += (q[(nb*ldq)+1] * h1); + x3 += (q[(nb*ldq)+2] * h1); + x4 += (q[(nb*ldq)+3] * h1); + + tau1 = hh[0]; + tau2 = hh[ldh]; + + h1 = (-1.0)*tau1; + + x1 *= h1; + x2 *= h1; + x3 *= h1; + x4 *= h1; + + h1 = (-1.0)*tau2; + h2 = (-1.0)*tau2; + h2 *= s; + y1 = y1*h1 +x1*h2; + y2 = y2*h1 +x2*h2; + y3 = y3*h1 +x3*h2; + y4 = y4*h1 +x4*h2; + + q[0] += y1; + q[1] += y2; + q[2] += y3; + q[3] += y4; + + h2 = hh[ldh+1]; + q[ldq+0] += (x1 + (y1*h2)); + q[ldq+1] += (x2 + (y2*h2)); + q[ldq+2] += (x3 + (y3*h2)); + q[ldq+3] += (x4 + (y4*h2)); + + for (i = 2; i < nb; i++) + { + h1 = hh[i-1]; + h2 = hh[ldh+i]; + + q[(i*ldq)+0] += ((x1*h1) + (y1*h2)); + q[(i*ldq)+1] += ((x2*h1) + (y2*h2)); + q[(i*ldq)+2] += ((x3*h1) + (y3*h2)); + q[(i*ldq)+3] += ((x4*h1) + (y4*h2)); + } + + h1 = hh[nb-1]; + q[(nb*ldq)+0] += (x1*h1); + q[(nb*ldq)+1] += (x2*h1); + q[(nb*ldq)+2] += (x3*h1); + q[(nb*ldq)+3] += (x4*h1); +} +#endif + +void double_hh_trafo_complex_sse_2hv_single_(std::complex* q, std::complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + std::complex s = conj(hh[(ldh)+1])*1.0; + for (i = 2; i < nb; i++) + { + s += hh[i-1] * conj(hh[(i+ldh)]); + } + +#if 1 + for (i = 0; i < nq; i+=4) + { + hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#else + for (i = 0; i < nq-2; i+=3) + { + hh_trafo_complex_kernel_3_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + if (nq-i > 1) + { + hh_trafo_complex_kernel_2_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + else if (nq-i > 0) + { + hh_trafo_complex_kernel_1_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#endif +} + +static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2, x3, x4; + __m128d y1, y2, y3, y4; + __m128d q1, q2, q3, q4; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2, tmp3, tmp4; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); + x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + y3 = _mm_load_pd(&q_dbl[4]); + y4 = _mm_load_pd(&q_dbl[6]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, 
_MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = 
_mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, 
_MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + tmp4 = _mm_mul_pd(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#else + y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + q4 = _mm_load_pd(&q_dbl[6]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + q3 = _mm_add_pd(q3, y3); + q4 = _mm_add_pd(q4, y4); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + _mm_store_pd(&q_dbl[6], q4); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); + q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + q4 = _mm_add_pd(q4, x4); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, 
_mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + _mm_store_pd(&q_dbl[(ldq*2)+4], q3); + _mm_store_pd(&q_dbl[(ldq*2)+6], q4); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = 
_mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + tmp4 = _mm_mul_pd(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#else + q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); + _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); +} + +static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2, x3; + __m128d y1, y2, y3; + __m128d q1, q2, q3; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2, tmp3; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + y3 = _mm_load_pd(&q_dbl[4]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef 
__ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); 
+#else + x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + tmp3 = _mm_mul_pd(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#else + y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + q3 = _mm_load_pd(&q_dbl[4]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + q3 = _mm_add_pd(q3, y3); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + _mm_store_pd(&q_dbl[4], q3); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, x2); + q3 = _mm_add_pd(q3, x3); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, 
y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + _mm_store_pd(&q_dbl[(ldq*2)+4], q3); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + tmp3 = _mm_mul_pd(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _mm_add_pd(q3, 
_mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#else + q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); + _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); +} + +static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1, x2; + __m128d y1, y2; + __m128d q1, q2; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1, tmp2; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + y2 = _mm_load_pd(&q_dbl[2]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = 
_mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + tmp2 = _mm_mul_pd(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#else + y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + q2 = _mm_load_pd(&q_dbl[2]); + + q1 = _mm_add_pd(q1, y1); + q2 = _mm_add_pd(q2, y2); + + _mm_store_pd(&q_dbl[0], q1); + _mm_store_pd(&q_dbl[2], q2); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); + + q1 = _mm_add_pd(q1, x1); + q2 = _mm_add_pd(q2, 
x2); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + _mm_store_pd(&q_dbl[(ldq*2)+2], q2); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + tmp2 = _mm_mul_pd(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#else + q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); + _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); +} + +static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_single(std::complex* q, std::complex* hh, int nb, int ldq, int ldh, std::complex s) +{ + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; + double* s_dbl = (double*)(&s); + + __m128d x1; + __m128d y1; + __m128d q1; + __m128d h1_real, h1_imag, h2_real, h2_imag; + __m128d tmp1; + int i=0; + + __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); + + x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); + + h2_real = 
_mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + y1 = _mm_load_pd(&q_dbl[0]); + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _mm_xor_pd(h2_imag, sign); +#endif + + tmp1 = _mm_mul_pd(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _mm_xor_pd(h1_imag, sign); +#endif + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm_mul_pd(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); + + h1_real = _mm_xor_pd(h1_real, sign); + h1_imag = _mm_xor_pd(h1_imag, sign); + h2_real = _mm_xor_pd(h2_real, sign); + h2_imag = _mm_xor_pd(h2_imag, sign); + + __m128d tmp2 = _mm_loadu_pd(s_dbl); + tmp1 = _mm_mul_pd(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + _mm_storeu_pd(s_dbl, tmp2); + h2_real = _mm_loaddup_pd(&s_dbl[0]); + h2_imag = _mm_loaddup_pd(&s_dbl[1]); + + tmp1 = _mm_mul_pd(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#else + y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); +#endif + + tmp1 = _mm_mul_pd(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _mm_add_pd(y1, 
_mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + q1 = _mm_load_pd(&q_dbl[0]); + + q1 = _mm_add_pd(q1, y1); + + _mm_store_pd(&q_dbl[0], q1); + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); + + q1 = _mm_add_pd(q1, x1); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(ldq*2)+0], q1); + + for (i = 2; i < nb; i++) + { + q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); + + h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); + + tmp1 = _mm_mul_pd(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); + } + + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); + + q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _mm_mul_pd(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#else + q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); +#endif + + _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); +} +} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_real.F90 b/src/elpa2_kernels/elpa2_kernels_real.F90 index 620455d90b200aa2dd0abc73809b41ced5a96e27..ac0cf5a5ef23b31136f3d2241dcdd84820b7db62 100644 --- a/src/elpa2_kernels/elpa2_kernels_real.F90 +++ b/src/elpa2_kernels/elpa2_kernels_real.F90 @@ -106,7 +106,7 @@ module real_generic_kernel ! 
Calculate dot product of the two Householder vectors - s = hh(2,2)*1 + s = hh(2,2)*1.0 do i=3,nb s = s+hh(i,2)*hh(i-1,1) enddo diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c similarity index 94% rename from src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c rename to src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c index 7aa1b84773e635a0b4b266dd8086a108dce9321c..664f5fbf58fb6575219b7fe1fc38dde758aaacc2 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c @@ -81,17 +81,17 @@ #endif //Forward declaration -__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); -void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void double_hh_trafo_real_avx_avx2_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void double_hh_trafo_real_avx_avx2_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -112,7 +112,7 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn // Production level kernel calls with padding for (i = 0; i < nq-20; i+=24) { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } if (nq == i) @@ -122,25 +122,25 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn if (nq-i == 20) { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s); + hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_AVX_2hv_double(&q[i+16], hh, nb, ldq, ldh, s); } else if (nq-i == 16) { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i == 12) { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s); + hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_AVX_2hv_double(&q[i+8], hh, nb, ldq, ldh, s); } else if (nq-i == 8) { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } else { - hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } } @@ -167,12 
+167,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, #ifdef __AVX__ for (i = 0; i < nq; i+=24) { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #else for (i = 0; i < nq; i+=12) { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #endif } @@ -184,7 +184,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ - __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) + __forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [24 x nb+1] * hh @@ -498,7 +498,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ - __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) + __forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [16 x nb+1] * hh @@ -732,7 +732,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ - __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) + __forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+1] * hh @@ -886,7 +886,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ - __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) + __forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+1] * hh diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..1b4f9a3af83c58aefcc2740c9e52eb966a969497 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c @@ -0,0 +1,1010 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
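Editor's note on indexing (an illustrative aside, not part of the patch): the Fortran expressions quoted in the comments of the single-precision driver below use 1-based, two-dimensional indexing into hh with leading dimension ldh, so hh(r,c) corresponds to hh[(r-1)+(c-1)*ldh] in C, and the "*1" in "s = hh(2,2)*1" presumably stands for the implicit leading element hh(1,1) = 1 of the first Householder vector. The coupling scalar s of the two Householder vectors is therefore accumulated as:

   float s = hh[ldh + 1];             /* Fortran: s = hh(2,2)                      */
   for (int i = 2; i < nb; i++)       /* C loop index i corresponds to Fortran i+1 */
       s += hh[i - 1] * hh[ldh + i];  /* Fortran: s = s + hh(i,2)*hh(i-1,1)        */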
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c) +#endif + +#endif + +//Forward declaration +__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); +__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); +__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); +__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s); + +void double_hh_trafo_real_avx_avx2_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void double_hh_trafo_fast_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void double_hh_trafo_real_avx_avx2_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar product to compute + // 2 householder vectors simultaneously + // + // Fortran: + // s = hh(2,2)*1 + float s = hh[(ldh)+1]*1.0; + + // FORTRAN: + // do = 3, nb + // s =s + hh(i,2)*hh(i-1,1) + #pragma ivdep + for (i = 2; i < nb; i++) + { + s += hh[i-1] * hh[(i+ldh)]; + } + + // Production level kernel calls with padding + for (i = 0; i < nq-20; i+=24) + { + hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + if (nq == i) + { + return; + } + if (nq-i == 20) + { + hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_AVX_2hv_single(&q[i+16], hh, nb, ldq, ldh, s); + } + else if (nq-i == 16) + { + hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + else if (nq-i == 12) + { + hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_AVX_2hv_single(&q[i+8], hh, nb, ldq, ldh, s); + } + else if (nq-i == 8) + { + hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } + else + { + hh_trafo_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +} + +#if 0 +void double_hh_trafo_fast_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar product to compute + // 2 householder vectors simultaneously + float s = hh[(ldh)+1]*1.0; + + #pragma ivdep + for (i = 2; i < nb; i++) + { + s += hh[i-1] * hh[(i+ldh)]; + } + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=24) + { + hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#else + for (i = 0; i < nq; i+=12) + { + hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s); + } +#endif +} +#endif + +/** + * Unrolled kernel that computes + * 24 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ + __forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) +{ + 
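// ---------------------------------------------------------------------------------------------
// Editorial sketch, not part of the patch: a scalar reference for what one SIMD lane of the
// 2hv kernels in this file computes for a single row j of Q, obtained by transcribing the AVX
// code element-wise (the helper name is invented for this sketch). hh holds the two Householder
// vectors (leading dimension ldh, the leading 1 of each vector being implicit), q points at
// row j of the block (leading dimension ldq), and s is the coupling scalar from the driver above.
//
//    static void hh_trafo_row_2hv_single_ref(float *q, const float *hh, int nb,
//                                            int ldq, int ldh, float s)
//    {
//        int i;
//        // x = <row j of Q(:,2:nb+1), v1>,   y = <row j of Q(:,1:nb), v2>   (leading 1s implicit)
//        float x = q[ldq];
//        float y = q[0] + q[ldq] * hh[ldh + 1];
//        for (i = 2; i < nb; i++)
//        {
//            x += q[i * ldq] * hh[i - 1];
//            y += q[i * ldq] * hh[ldh + i];
//        }
//        x += q[nb * ldq] * hh[nb - 1];
//
//        // rank-2 coefficients: tau1 = hh[0], tau2 = hh[ldh]; the sign flips match the
//        // _mm256_xor_ps(..., sign) operations in the vector code
//        x = -hh[0] * x;
//        y = -hh[ldh] * y - hh[ldh] * s * x;
//
//        // rank-2 update of row j
//        q[0]       += y;
//        q[ldq]     += x + y * hh[ldh + 1];
//        for (i = 2; i < nb; i++)
//            q[i * ldq] += x * hh[i - 1] + y * hh[ldh + i];
//        q[nb * ldq] += x * hh[nb - 1];
//    }
// ---------------------------------------------------------------------------------------------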
///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [24 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m256 sign = (__m256)_mm256_set1_epi32(0x80000000); + + __m256 x1 = _mm256_load_ps(&q[ldq]); //load q(1,2), q(2,2), q(3,2),q(4,2), q(5,2), q(6,2), q(7,2), q(8,2) + + __m256 x2 = _mm256_load_ps(&q[ldq+8]); // load q(9,2) ... q(16,2) + + __m256 x3 = _mm256_load_ps(&q[ldq+16]); // load q(17,2) .. q(24,2) +// __m256 x4 = _mm256_load_ps(&q[ldq+12]); +// __m256 x5 = _mm256_load_ps(&q[ldq+16]); +// __m256 x6 = _mm256_load_ps(&q[ldq+20]); + + __m256 h1 = _mm256_broadcast_ss(&hh[ldh+1]); // h1 = hh(2,2) | hh(2,2) | hh(2,2) | hh(2,2) | hh(2,2) | hh(2,2) | hh(2,2) | hh(2,2) + __m256 h2; + +#ifdef __ELPA_USE_FMA__ + __m256 q1 = _mm256_load_ps(q); // q1 = q(1,1), q(2,1), q(3,1), q(4,1), q(5,1), q(6,1), q(7,1), q(8,1) + __m256 y1 = _mm256_FMA_ps(x1, h1, q1); // y1 = q(1,2) * h(2,2) + q(1,1) | q(2,2) * h(2,2) + q(2,1) | .... | q(8,2) * h(2,2) + q(8,1) + __m256 q2 = _mm256_load_ps(&q[8]); // q2 = q(9,1) | .... | q(16,1) + __m256 y2 = _mm256_FMA_ps(x2, h1, q2); // y2 = q(9,2) * hh(2,2) + q(9,1) | ... | q(16,2) * h(2,2) + q(16,1) + __m256 q3 = _mm256_load_ps(&q[16]); // q3 = q(17,1) | ... | q(24,1) + __m256 y3 = _mm256_FMA_ps(x3, h1, q3); // y3 = q(17,2) * hh(2,2) + q(17,1) ... | q(24,2) * hh(2,2) + q(24,1) +// __m256 q4 = _mm256_load_ps(&q[12]); +// __m256 y4 = _mm256_FMA_ps(x4, h1, q4); +// __m256 q5 = _mm256_load_ps(&q[16]); +// __m256 y5 = _mm256_FMA_ps(x5, h1, q5); +// __m256 q6 = _mm256_load_ps(&q[20]); +// __m256 y6 = _mm256_FMA_ps(x6, h1, q6); +#else + __m256 q1 = _mm256_load_ps(q); // q1 = q(1,1), q(2,1), q(3,1), q(4,1), q(5,1), q(6,1), q(7,1), q(8,1) + __m256 y1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); // y1 = q(1,2) * h(2,2) + q(1,1) | q(2,2) * h(2,2) + q(2,1) | .... | q(8,2) * h(2,2) + q(8,1) + __m256 q2 = _mm256_load_ps(&q[8]); // q2 = q(9,1) | .... | q(16,1) + __m256 y2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); // y2 = q(9,2) * hh(2,2) + q(9,1) | ... | q(16,2) * h(2,2) + q(16,1) + __m256 q3 = _mm256_load_ps(&q[16]); // q3 = q(17,1) | ... | q(24,1) + __m256 y3 = _mm256_add_ps(q3, _mm256_mul_ps(x3, h1)); // y3 = q(17,2) * hh(2,2) + q(17,1) ... | q(24,2) * hh(2,2) + q(24,1) +// __m256 q4 = _mm256_load_ps(&q[12]); +// __m256 y4 = _mm256_add_ps(q4, _mm256_mul_ps(x4, h1)); +// __m256 q5 = _mm256_load_ps(&q[16]); +// __m256 y5 = _mm256_add_ps(q5, _mm256_mul_ps(x5, h1)); +// __m256 q6 = _mm256_load_ps(&q[20]); +// __m256 y6 = _mm256_add_ps(q6, _mm256_mul_ps(x6, h1)); +#endif + for(i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); // h1 = hh(i-1,1) | ... | hh(i-1,1) + h2 = _mm256_broadcast_ss(&hh[ldh+i]); // h2 = hh(i,2) | ... | hh(i,2) +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); // q1 = q(1,i) | q(2,i) | q(3,i) | ... 
| q(8,i) + x1 = _mm256_FMA_ps(q1, h1, x1); + y1 = _mm256_FMA_ps(q1, h2, y1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + x2 = _mm256_FMA_ps(q2, h1, x2); + y2 = _mm256_FMA_ps(q2, h2, y2); + q3 = _mm256_load_ps(&q[(i*ldq)+16]); + x3 = _mm256_FMA_ps(q3, h1, x3); + y3 = _mm256_FMA_ps(q3, h2, y3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// x4 = _mm256_FMA_ps(q4, h1, x4); +// y4 = _mm256_FMA_ps(q4, h2, y4); +// q5 = _mm256_load_ps(&q[(i*ldq)+16]); +// x5 = _mm256_FMA_ps(q5, h1, x5); +// y5 = _mm256_FMA_ps(q5, h2, y5); +// q6 = _mm256_load_ps(&q[(i*ldq)+20]); +// x6 = _mm256_FMA_ps(q6, h1, x6); +// y6 = _mm256_FMA_ps(q6, h2, y6); +#else + q1 = _mm256_load_ps(&q[i*ldq]); // q1 = q(1,i) | q(2,i) | q(3,i) | ... | q(8,i) + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); // x1 = q(1,i) * hh(i-1,1) + x1 | ... | q(8,i) ** hh(i-1,1) * x1 + y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); // y1 = q(1,i) * hh(i,2) + y1 | ... + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); + y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2)); + q3 = _mm256_load_ps(&q[(i*ldq)+16]); + x3 = _mm256_add_ps(x3, _mm256_mul_ps(q3,h1)); + y3 = _mm256_add_ps(y3, _mm256_mul_ps(q3,h2)); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// x4 = _mm256_add_ps(x4, _mm256_mul_ps(q4,h1)); +// y4 = _mm256_add_ps(y4, _mm256_mul_ps(q4,h2)); +// q5 = _mm256_load_ps(&q[(i*ldq)+16]); +// x5 = _mm256_add_ps(x5, _mm256_mul_ps(q5,h1)); +// y5 = _mm256_add_ps(y5, _mm256_mul_ps(q5,h2)); +// q6 = _mm256_load_ps(&q[(i*ldq)+20]); +// x6 = _mm256_add_ps(x6, _mm256_mul_ps(q6,h1)); +// y6 = _mm256_add_ps(y6, _mm256_mul_ps(q6,h2)); +#endif + } + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_FMA_ps(q1, h1, x1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + x2 = _mm256_FMA_ps(q2, h1, x2); + q3 = _mm256_load_ps(&q[(nb*ldq)+16]); + x3 = _mm256_FMA_ps(q3, h1, x3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// x4 = _mm256_FMA_ps(q4, h1, x4); +// q5 = _mm256_load_ps(&q[(nb*ldq)+16]); +// x5 = _mm256_FMA_ps(q5, h1, x5); +// q6 = _mm256_load_ps(&q[(nb*ldq)+20]); +// x6 = _mm256_FMA_ps(q6, h1, x6); +#else + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); + q3 = _mm256_load_ps(&q[(nb*ldq)+16]); + x3 = _mm256_add_ps(x3, _mm256_mul_ps(q3,h1)); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// x4 = _mm256_add_ps(x4, _mm256_mul_ps(q4,h1)); +// q5 = _mm256_load_ps(&q[(nb*ldq)+16]); +// x5 = _mm256_add_ps(x5, _mm256_mul_ps(q5,h1)); +// q6 = _mm256_load_ps(&q[(nb*ldq)+20]); +// x6 = _mm256_add_ps(x6, _mm256_mul_ps(q6,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [24 x nb+1] + ///////////////////////////////////////////////////// + + __m256 tau1 = _mm256_broadcast_ss(hh); + __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); + __m256 vs = _mm256_broadcast_ss(&s); + + +// carefull here + + h1 = _mm256_xor_ps(tau1, sign); + x1 = _mm256_mul_ps(x1, h1); + x2 = _mm256_mul_ps(x2, h1); + x3 = _mm256_mul_ps(x3, h1); +// x4 = _mm256_mul_ps(x4, h1); +// x5 = _mm256_mul_ps(x5, h1); +// x6 = _mm256_mul_ps(x6, h1); + h1 = _mm256_xor_ps(tau2, sign); + h2 = _mm256_mul_ps(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_ps(y1, h1, _mm256_mul_ps(x1,h2)); + y2 = _mm256_FMA_ps(y2, h1, _mm256_mul_ps(x2,h2)); + y3 = _mm256_FMA_ps(y3, h1, _mm256_mul_ps(x3,h2)); +// y4 = _mm256_FMA_ps(y4, h1, _mm256_mul_ps(x4,h2)); +// y5 = _mm256_FMA_ps(y5, h1, 
_mm256_mul_ps(x5,h2)); +// y6 = _mm256_FMA_ps(y6, h1, _mm256_mul_ps(x6,h2)); +#else + y1 = _mm256_add_ps(_mm256_mul_ps(y1,h1), _mm256_mul_ps(x1,h2)); + y2 = _mm256_add_ps(_mm256_mul_ps(y2,h1), _mm256_mul_ps(x2,h2)); + y3 = _mm256_add_ps(_mm256_mul_ps(y3,h1), _mm256_mul_ps(x3,h2)); +// y4 = _mm256_add_ps(_mm256_mul_ps(y4,h1), _mm256_mul_ps(x4,h2)); +// y5 = _mm256_add_ps(_mm256_mul_ps(y5,h1), _mm256_mul_ps(x5,h2)); +// y6 = _mm256_add_ps(_mm256_mul_ps(y6,h1), _mm256_mul_ps(x6,h2)); +#endif + + q1 = _mm256_load_ps(q); + q1 = _mm256_add_ps(q1, y1); + _mm256_store_ps(q,q1); + q2 = _mm256_load_ps(&q[8]); + q2 = _mm256_add_ps(q2, y2); + _mm256_store_ps(&q[8],q2); + q3 = _mm256_load_ps(&q[16]); + q3 = _mm256_add_ps(q3, y3); + _mm256_store_ps(&q[16],q3); +// q4 = _mm256_load_ps(&q[12]); +// q4 = _mm256_add_ps(q4, y4); +// _mm256_store_ps(&q[12],q4); +// q5 = _mm256_load_ps(&q[16]); +// q5 = _mm256_add_ps(q5, y5); +// _mm256_store_ps(&q[16],q5); +// q6 = _mm256_load_ps(&q[20]); +// q6 = _mm256_add_ps(q6, y6); +// _mm256_store_ps(&q[20],q6); + + h2 = _mm256_broadcast_ss(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_FMA_ps(y1, h2, x1)); + _mm256_store_ps(&q[ldq],q1); + q2 = _mm256_load_ps(&q[ldq+8]); + q2 = _mm256_add_ps(q2, _mm256_FMA_ps(y2, h2, x2)); + _mm256_store_ps(&q[ldq+8],q2); + q3 = _mm256_load_ps(&q[ldq+16]); + q3 = _mm256_add_ps(q3, _mm256_FMA_ps(y3, h2, x3)); + _mm256_store_ps(&q[ldq+16],q3); +// q4 = _mm256_load_ps(&q[ldq+12]); +// q4 = _mm256_add_ps(q4, _mm256_FMA_ps(y4, h2, x4)); +// _mm256_store_ps(&q[ldq+12],q4); +// q5 = _mm256_load_ps(&q[ldq+16]); +// q5 = _mm256_add_ps(q5, _mm256_FMA_ps(y5, h2, x5)); +// _mm256_store_ps(&q[ldq+16],q5); +// q6 = _mm256_load_ps(&q[ldq+20]); +// q6 = _mm256_add_ps(q6, _mm256_FMA_ps(y6, h2, x6)); +// _mm256_store_ps(&q[ldq+20],q6); +#else + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(x1, _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[ldq],q1); + q2 = _mm256_load_ps(&q[ldq+8]); + q2 = _mm256_add_ps(q2, _mm256_add_ps(x2, _mm256_mul_ps(y2, h2))); + _mm256_store_ps(&q[ldq+8],q2); + q3 = _mm256_load_ps(&q[ldq+16]); + q3 = _mm256_add_ps(q3, _mm256_add_ps(x3, _mm256_mul_ps(y3, h2))); + _mm256_store_ps(&q[ldq+16],q3); +// q4 = _mm256_load_ps(&q[ldq+12]); +// q4 = _mm256_add_ps(q4, _mm256_add_ps(x4, _mm256_mul_ps(y4, h2))); +// _mm256_store_ps(&q[ldq+12],q4); +// q5 = _mm256_load_ps(&q[ldq+16]); +// q5 = _mm256_add_ps(q5, _mm256_add_ps(x5, _mm256_mul_ps(y5, h2))); +// _mm256_store_ps(&q[ldq+16],q5); +// q6 = _mm256_load_ps(&q[ldq+20]); +// q6 = _mm256_add_ps(q6, _mm256_add_ps(x6, _mm256_mul_ps(y6, h2))); +// _mm256_store_ps(&q[ldq+20],q6); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + q1 = _mm256_FMA_ps(y1, h2, q1); + _mm256_store_ps(&q[i*ldq],q1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + q2 = _mm256_FMA_ps(x2, h1, q2); + q2 = _mm256_FMA_ps(y2, h2, q2); + _mm256_store_ps(&q[(i*ldq)+8],q2); + q3 = _mm256_load_ps(&q[(i*ldq)+16]); + q3 = _mm256_FMA_ps(x3, h1, q3); + q3 = _mm256_FMA_ps(y3, h2, q3); + _mm256_store_ps(&q[(i*ldq)+16],q3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// q4 = _mm256_FMA_ps(x4, h1, q4); +// q4 = _mm256_FMA_ps(y4, h2, q4); +// _mm256_store_ps(&q[(i*ldq)+12],q4); +// q5 = _mm256_load_ps(&q[(i*ldq)+16]); +/// q5 = _mm256_FMA_ps(x5, h1, q5); +// q5 = _mm256_FMA_ps(y5, h2, q5); +// 
_mm256_store_ps(&q[(i*ldq)+16],q5); +// q6 = _mm256_load_ps(&q[(i*ldq)+20]); +// q6 = _mm256_FMA_ps(x6, h1, q6); +// q6 = _mm256_FMA_ps(y6, h2, q6); +// _mm256_store_ps(&q[(i*ldq)+20],q6); +#else + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(_mm256_mul_ps(x1,h1), _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[i*ldq],q1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + q2 = _mm256_add_ps(q2, _mm256_add_ps(_mm256_mul_ps(x2,h1), _mm256_mul_ps(y2, h2))); + _mm256_store_ps(&q[(i*ldq)+8],q2); + q3 = _mm256_load_ps(&q[(i*ldq)+16]); + q3 = _mm256_add_ps(q3, _mm256_add_ps(_mm256_mul_ps(x3,h1), _mm256_mul_ps(y3, h2))); + _mm256_store_ps(&q[(i*ldq)+16],q3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// q4 = _mm256_add_ps(q4, _mm256_add_ps(_mm256_mul_ps(x4,h1), _mm256_mul_ps(y4, h2))); +// _mm256_store_ps(&q[(i*ldq)+12],q4); +// q5 = _mm256_load_ps(&q[(i*ldq)+16]); +// q5 = _mm256_add_ps(q5, _mm256_add_ps(_mm256_mul_ps(x5,h1), _mm256_mul_ps(y5, h2))); +// _mm256_store_ps(&q[(i*ldq)+16],q5); +// q6 = _mm256_load_ps(&q[(i*ldq)+20]); +// q6 = _mm256_add_ps(q6, _mm256_add_ps(_mm256_mul_ps(x6,h1), _mm256_mul_ps(y6, h2))); +// _mm256_store_ps(&q[(i*ldq)+20],q6); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + _mm256_store_ps(&q[nb*ldq],q1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + q2 = _mm256_FMA_ps(x2, h1, q2); + _mm256_store_ps(&q[(nb*ldq)+8],q2); + q3 = _mm256_load_ps(&q[(nb*ldq)+16]); + q3 = _mm256_FMA_ps(x3, h1, q3); + _mm256_store_ps(&q[(nb*ldq)+16],q3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +/// q4 = _mm256_FMA_ps(x4, h1, q4); +// _mm256_store_ps(&q[(nb*ldq)+12],q4); +// q5 = _mm256_load_ps(&q[(nb*ldq)+16]); +// q5 = _mm256_FMA_ps(x5, h1, q5); +// _mm256_store_ps(&q[(nb*ldq)+16],q5); +// q6 = _mm256_load_ps(&q[(nb*ldq)+20]); +// q6 = _mm256_FMA_ps(x6, h1, q6); +// _mm256_store_ps(&q[(nb*ldq)+20],q6); +#else + q1 = _mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); + _mm256_store_ps(&q[nb*ldq],q1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + q2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); + _mm256_store_ps(&q[(nb*ldq)+8],q2); + q3 = _mm256_load_ps(&q[(nb*ldq)+16]); + q3 = _mm256_add_ps(q3, _mm256_mul_ps(x3, h1)); + _mm256_store_ps(&q[(nb*ldq)+16],q3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// q4 = _mm256_add_ps(q4, _mm256_mul_ps(x4, h1)); +// _mm256_store_ps(&q[(nb*ldq)+12],q4); +// q5 = _mm256_load_ps(&q[(nb*ldq)+16]); +// q5 = _mm256_add_ps(q5, _mm256_mul_ps(x5, h1)); +// _mm256_store_ps(&q[(nb*ldq)+16],q5); +// q6 = _mm256_load_ps(&q[(nb*ldq)+20]); +// q6 = _mm256_add_ps(q6, _mm256_mul_ps(x6, h1)); +// _mm256_store_ps(&q[(nb*ldq)+20],q6); +#endif +} + +/** + * Unrolled kernel that computes + * 16 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ + __forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [16 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m256 sign = (__m256)_mm256_set1_epi32(0x80000000); + + __m256 x1 = _mm256_load_ps(&q[ldq]); + __m256 x2 = _mm256_load_ps(&q[ldq+8]); +// __m256 x3 = _mm256_load_ps(&q[ldq+16]); +// __m256 x4 = _mm256_load_ps(&q[ldq+12]); + + __m256 h1 = 
_mm256_broadcast_ss(&hh[ldh+1]); + __m256 h2; + +#ifdef __ELPA_USE_FMA__ + __m256 q1 = _mm256_load_ps(q); + __m256 y1 = _mm256_FMA_ps(x1, h1, q1); + __m256 q2 = _mm256_load_ps(&q[8]); + __m256 y2 = _mm256_FMA_ps(x2, h1, q2); +// __m256 q3 = _mm256_load_ps(&q[16]); +// __m256 y3 = _mm256_FMA_ps(x3, h1, q3); +// __m256 q4 = _mm256_load_ps(&q[12]); +// __m256 y4 = _mm256_FMA_ps(x4, h1, q4); +#else + __m256 q1 = _mm256_load_ps(q); + __m256 y1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); + __m256 q2 = _mm256_load_ps(&q[8]); + __m256 y2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); +// __m256 q3 = _mm256_load_ps(&q[16]); +// __m256 y3 = _mm256_add_ps(q3, _mm256_mul_ps(x3, h1)); +// __m256 q4 = _mm256_load_ps(&q[12]); +// __m256 y4 = _mm256_add_ps(q4, _mm256_mul_ps(x4, h1)); +#endif + + for(i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); + x1 = _mm256_FMA_ps(q1, h1, x1); + y1 = _mm256_FMA_ps(q1, h2, y1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + x2 = _mm256_FMA_ps(q2, h1, x2); + y2 = _mm256_FMA_ps(q2, h2, y2); +// q3 = _mm256_load_ps(&q[(i*ldq)+8]); +// x3 = _mm256_FMA_ps(q3, h1, x3); +// y3 = _mm256_FMA_ps(q3, h2, y3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// x4 = _mm256_FMA_ps(q4, h1, x4); +// y4 = _mm256_FMA_ps(q4, h2, y4); +#else + q1 = _mm256_load_ps(&q[i*ldq]); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); + y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); + y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2)); +// q3 = _mm256_load_ps(&q[(i*ldq)+8]); +// x3 = _mm256_add_ps(x3, _mm256_mul_ps(q3,h1)); +// y3 = _mm256_add_ps(y3, _mm256_mul_ps(q3,h2)); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// x4 = _mm256_add_ps(x4, _mm256_mul_ps(q4,h1)); +// y4 = _mm256_add_ps(y4, _mm256_mul_ps(q4,h2)); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_FMA_ps(q1, h1, x1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + x2 = _mm256_FMA_ps(q2, h1, x2); +// q3 = _mm256_load_ps(&q[(nb*ldq)+8]); +// x3 = _mm256_FMA_ps(q3, h1, x3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// x4 = _mm256_FMA_ps(q4, h1, x4); +#else + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); +// q3 = _mm256_load_ps(&q[(nb*ldq)+8]); +// x3 = _mm256_add_ps(x3, _mm256_mul_ps(q3,h1)); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// x4 = _mm256_add_ps(x4, _mm256_mul_ps(q4,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [16 x nb+1] + ///////////////////////////////////////////////////// + + __m256 tau1 = _mm256_broadcast_ss(hh); + __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); + __m256 vs = _mm256_broadcast_ss(&s); + + +// carefulle + + h1 = _mm256_xor_ps(tau1, sign); + x1 = _mm256_mul_ps(x1, h1); + x2 = _mm256_mul_ps(x2, h1); +// x3 = _mm256_mul_ps(x3, h1); +// x4 = _mm256_mul_ps(x4, h1); + h1 = _mm256_xor_ps(tau2, sign); + h2 = _mm256_mul_ps(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_ps(y1, h1, _mm256_mul_ps(x1,h2)); + y2 = _mm256_FMA_ps(ys, h1, _mm256_mul_ps(x2,h2)); +// y3 = _mm256_FMA_ps(y3, h1, _mm256_mul_ps(x3,h2)); +// y4 = _mm256_FMA_ps(y4, h1, _mm256_mul_ps(x4,h2)); +#else + y1 = _mm256_add_ps(_mm256_mul_ps(y1,h1), _mm256_mul_ps(x1,h2)); + y2 = _mm256_add_ps(_mm256_mul_ps(y2,h1), _mm256_mul_ps(x2,h2)); +// y3 = 
_mm256_add_ps(_mm256_mul_ps(y3,h1), _mm256_mul_ps(x3,h2)); +// y4 = _mm256_add_ps(_mm256_mul_ps(y4,h1), _mm256_mul_ps(x4,h2)); +#endif + + q1 = _mm256_load_ps(q); + q1 = _mm256_add_ps(q1, y1); + _mm256_store_ps(q,q1); + q2 = _mm256_load_ps(&q[8]); + q2 = _mm256_add_ps(q2, y2); + _mm256_store_ps(&q[8],q2); +// q3 = _mm256_load_psa(&q[8]); +// q3 = _mm256_add_ps(q3, y3); +// _mm256_store_ps(&q[8],q3); +// q4 = _mm256_load_ps(&q[12]); +// q4 = _mm256_add_ps(q4, y4); +// _mm256_store_ps(&q[12],q4); + + h2 = _mm256_broadcast_ss(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_FMA_ps(y1, h2, x1)); + _mm256_store_ps(&q[ldq],q1); + q2 = _mm256_load_ps(&q[ldq+8]); + q2 = _mm256_add_ps(q2, _mm256_FMA_ps(y2, h2, x2)); + _mm256_store_ps(&q[ldq+8],q2); +// q3 = _mm256_load_ps(&q[ldq+8]); +// q3 = _mm256_add_ps(q3, _mm256_FMA_ps(y3, h2, x3)); +// _mm256_store_ps(&q[ldq+8],q3); +// q4 = _mm256_load_ps(&q[ldq+12]); +// q4 = _mm256_add_ps(q4, _mm256_FMA_ps(y4, h2, x4)); +// _mm256_store_ps(&q[ldq+12],q4); +#else + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(x1, _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[ldq],q1); + q2 = _mm256_load_ps(&q[ldq+8]); + q2 = _mm256_add_ps(q2, _mm256_add_ps(x2, _mm256_mul_ps(y2, h2))); + _mm256_store_ps(&q[ldq+8],q2); +// q3 = _mm256_load_ps(&q[ldq+8]); +// q3 = _mm256_add_ps(q3, _mm256_add_ps(x3, _mm256_mul_ps(y3, h2))); +// _mm256_store_ps(&q[ldq+8],q3); +// q4 = _mm256_load_ps(&q[ldq+12]); +// q4 = _mm256_add_ps(q4, _mm256_add_ps(x4, _mm256_mul_ps(y4, h2))); +// _mm256_store_ps(&q[ldq+12],q4); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + q1 = _mm256_FMA_ps(y1, h2, q1); + _mm256_store_ps(&q[i*ldq],q1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + q2 = _mm256_FMA_ps(x2, h1, q2); + q2 = _mm256_FMA_ps(y2, h2, q2); + _mm256_store_ps(&q[(i*ldq)+8],q2); +// q3 = _mm256_load_ps(&q[(i*ldq)+8]); +// q3 = _mm256_FMA_ps(x3, h1, q3); +// q3 = _mm256_FMA_ps(y3, h2, q3); +// _mm256_store_ps(&q[(i*ldq)+8],q3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// q4 = _mm256_FMA_ps(x4, h1, q4); +// q4 = _mm256_FMA_ps(y4, h2, q4); +// _mm256_store_ps(&q[(i*ldq)+12],q4); +#else + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(_mm256_mul_ps(x1,h1), _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[i*ldq],q1); + q2 = _mm256_load_ps(&q[(i*ldq)+8]); + q2 = _mm256_add_ps(q2, _mm256_add_ps(_mm256_mul_ps(x2,h1), _mm256_mul_ps(y2, h2))); + _mm256_store_ps(&q[(i*ldq)+8],q2); +// q3 = _mm256_load_ps(&q[(i*ldq)+8]); +// q3 = _mm256_add_ps(q3, _mm256_add_ps(_mm256_mul_ps(x3,h1), _mm256_mul_ps(y3, h2))); +// _mm256_store_ps(&q[(i*ldq)+8],q3); +// q4 = _mm256_load_ps(&q[(i*ldq)+12]); +// q4 = _mm256_add_ps(q4, _mm256_add_ps(_mm256_mul_ps(x4,h1), _mm256_mul_ps(y4, h2))); +// _mm256_store_ps(&q[(i*ldq)+12],q4); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + _mm256_store_ps(&q[nb*ldq],q1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + q2 = _mm256_FMA_ps(x2, h1, q2); + _mm256_store_ps(&q[(nb*ldq)+8],q2); +// q3 = _mm256_load_ps(&q[(nb*ldq)+8]); +// q3 = _mm256_FMA_ps(x3, h1, q3); +// _mm256_store_ps(&q[(nb*ldq)+8],q3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// q4 = _mm256_FMA_ps(x4, h1, q4); +// _mm256_store_ps(&q[(nb*ldq)+12],q4); +#else + q1 = 
_mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); + _mm256_store_ps(&q[nb*ldq],q1); + q2 = _mm256_load_ps(&q[(nb*ldq)+8]); + q2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); + _mm256_store_ps(&q[(nb*ldq)+8],q2); +// q3 = _mm256_load_ps(&q[(nb*ldq)+8]); +// q3 = _mm256_add_ps(q3, _mm256_mul_ps(x3, h1)); +// _mm256_store_ps(&q[(nb*ldq)+8],q3); +// q4 = _mm256_load_ps(&q[(nb*ldq)+12]); +// q4 = _mm256_add_ps(q4, _mm256_mul_ps(x4, h1)); +// _mm256_store_ps(&q[(nb*ldq)+12],q4); +#endif +} + +/** + * Unrolled kernel that computes + * 8 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ + __forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m256 sign = (__m256)_mm256_set1_epi32(0x80000000); + + __m256 x1 = _mm256_load_ps(&q[ldq]); +// __m256 x2 = _mm256_load_ps(&q[ldq+8]); + + __m256 h1 = _mm256_broadcast_ss(&hh[ldh+1]); + __m256 h2; + +#ifdef __ELPA_USE_FMA__ + __m256 q1 = _mm256_load_ps(q); + __m256 y1 = _mm256_FMA_ps(x1, h1, q1); +// __m256 q2 = _mm256_load_ps(&q[4]); +// __m256 y2 = _mm256_FMA_ps(x2, h1, q2); +#else + __m256 q1 = _mm256_load_ps(q); + __m256 y1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); +// __m256 q2 = _mm256_load_ps(&q[4]); +// __m256 y2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); +#endif + + for(i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); + x1 = _mm256_FMA_ps(q1, h1, x1); + y1 = _mm256_FMA_ps(q1, h2, y1); +// q2 = _mm256_load_ps(&q[(i*ldq)+4]); +// x2 = _mm256_FMA_ps(q2, h1, x2); +// y2 = _mm256_FMA_ps(q2, h2, y2); +#else + q1 = _mm256_load_ps(&q[i*ldq]); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); + y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); +// q2 = _mm256_load_ps(&q[(i*ldq)+4]); +// x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); +// y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2)); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_FMA_ps(q1, h1, x1); +// q2 = _mm256_load_ps(&q[(nb*ldq)+4]); +// x2 = _mm256_FMA_ps(q2, h1, x2); +#else + q1 = _mm256_load_ps(&q[nb*ldq]); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); +// q2 = _mm256_load_ps(&q[(nb*ldq)+4]); +// x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [8 x nb+1] + ///////////////////////////////////////////////////// + + __m256 tau1 = _mm256_broadcast_ss(hh); + __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); + __m256 vs = _mm256_broadcast_ss(&s); + +// carefulle + + h1 = _mm256_xor_ps(tau1, sign); + x1 = _mm256_mul_ps(x1, h1); +// x2 = _mm256_mul_ps(x2, h1); + h1 = _mm256_xor_ps(tau2, sign); + h2 = _mm256_mul_ps(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_ps(y1, h1, _mm256_mul_ps(x1,h2)); +// y2 = _mm256_FMA_ps(y2, h1, _mm256_mul_ps(x2,h2)); +#else + y1 = _mm256_add_ps(_mm256_mul_ps(y1,h1), _mm256_mul_ps(x1,h2)); +// y2 = _mm256_add_ps(_mm256_mul_ps(y2,h1), _mm256_mul_ps(x2,h2)); +#endif + + q1 = _mm256_load_ps(q); + q1 = _mm256_add_ps(q1, y1); + _mm256_store_ps(q,q1); +// q2 = _mm256_load_ps(&q[4]); 
+// q2 = _mm256_add_ps(q2, y2); +// _mm256_store_ps(&q[4],q2); + + h2 = _mm256_broadcast_ss(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_FMA_ps(y1, h2, x1)); + _mm256_store_ps(&q[ldq],q1); +// q2 = _mm256_load_ps(&q[ldq+4]); +// q2 = _mm256_add_ps(q2, _mm256_FMA_ps(y2, h2, x2)); +// _mm256_store_ps(&q[ldq+4],q2); +#else + q1 = _mm256_load_ps(&q[ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(x1, _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[ldq],q1); +// q2 = _mm256_load_ps(&q[ldq+4]); +// q2 = _mm256_add_ps(q2, _mm256_add_ps(x2, _mm256_mul_ps(y2, h2))); +// _mm256_store_ps(&q[ldq+4],q2); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + q1 = _mm256_FMA_ps(y1, h2, q1); + _mm256_store_ps(&q[i*ldq],q1); +// q2 = _mm256_load_ps(&q[(i*ldq)+4]); +// q2 = _mm256_FMA_ps(x2, h1, q2); +// q2 = _mm256_FMA_ps(y2, h2, q2); +// _mm256_store_ps(&q[(i*ldq)+4],q2); +#else + q1 = _mm256_load_ps(&q[i*ldq]); + q1 = _mm256_add_ps(q1, _mm256_add_ps(_mm256_mul_ps(x1,h1), _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[i*ldq],q1); +// q2 = _mm256_load_ps(&q[(i*ldq)+4]); +// q2 = _mm256_add_ps(q2, _mm256_add_ps(_mm256_mul_ps(x2,h1), _mm256_mul_ps(y2, h2))); +// _mm256_store_ps(&q[(i*ldq)+4],q2); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_FMA_ps(x1, h1, q1); + _mm256_store_ps(&q[nb*ldq],q1); +// q2 = _mm256_load_ps(&q[(nb*ldq)+4]); +// q2 = _mm256_FMA_ps(x2, h1, q2); +// _mm256_store_ps(&q[(nb*ldq)+4],q2); +#else + q1 = _mm256_load_ps(&q[nb*ldq]); + q1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); + _mm256_store_ps(&q[nb*ldq],q1); +// q2 = _mm256_load_ps(&q[(nb*ldq)+4]); +// q2 = _mm256_add_ps(q2, _mm256_mul_ps(x2, h1)); +// _mm256_store_ps(&q[(nb*ldq)+4],q2); +#endif +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ + __forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m256 sign = (__m256)_mm256_set1_epi32(0x80000000); + + __m256 x1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq])); + + __m256 h1 = _mm256_broadcast_ss(&hh[ldh+1]); + __m256 h2; + +#ifdef __ELPA_USE_FMA__ + __m256 q1 = _mm256_castps128_ps256(_mm_load_ps(q)); + __m256 y1 = _mm256_FMA_ps(x1, h1, q1); +#else + __m256 q1 = _mm256_castps128_ps256(_mm_load_ps(q)); + __m256 y1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); +#endif + + for(i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); + x1 = _mm256_FMA_ps(q1, h1, x1); + y1 = _mm256_FMA_ps(q1, h2, y1); +#else + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); + y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); + x1 = _mm256_FMA_ps(q1, h1, x1); +#else 
+ q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); + x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [4 x nb+1] + ///////////////////////////////////////////////////// + + __m256 tau1 = _mm256_broadcast_ss(hh); + __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); + __m256 vs = _mm256_broadcast_ss(&s); + +//carefull + + h1 = _mm256_xor_ps(tau1, sign); + x1 = _mm256_mul_ps(x1, h1); + h1 = _mm256_xor_ps(tau2, sign); + h2 = _mm256_mul_ps(h1, vs); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_ps(y1, h1, _mm256_mul_ps(x1,h2)); +#else + y1 = _mm256_add_ps(_mm256_mul_ps(y1,h1), _mm256_mul_ps(x1,h2)); +#endif + + q1 = _mm256_castps128_ps256(_mm_load_ps(q)); + q1 = _mm256_add_ps(q1, y1); + _mm256_store_ps(q,q1); + + h2 = _mm256_broadcast_ss(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq])); + q1 = _mm256_add_ps(q1, _mm256_FMA_ps(y1, h2, x1)); + _mm256_store_ps(&q[ldq],q1); +#else + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq])); + q1 = _mm256_add_ps(q1, _mm256_add_ps(x1, _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[ldq],q1); +#endif + + for (i = 2; i < nb; i++) + { + h1 = _mm256_broadcast_ss(&hh[i-1]); + h2 = _mm256_broadcast_ss(&hh[ldh+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); + q1 = _mm256_FMA_ps(x1, h1, q1); + q1 = _mm256_FMA_ps(y1, h2, q1); + _mm256_store_ps(&q[i*ldq],q1); +#else + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); + q1 = _mm256_add_ps(q1, _mm256_add_ps(_mm256_mul_ps(x1,h1), _mm256_mul_ps(y1, h2))); + _mm256_store_ps(&q[i*ldq],q1); +#endif + } + + h1 = _mm256_broadcast_ss(&hh[nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); + q1 = _mm256_FMA_ps(x1, h1, q1); + _mm256_store_ps(&q[nb*ldq],q1); +#else + q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); + q1 = _mm256_add_ps(q1, _mm256_mul_ps(x1, h1)); + _mm256_store_ps(&q[nb*ldq],q1); +#endif +} + diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c similarity index 94% rename from src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c rename to src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c index 97ba19abcebf45a26ec92ac9f32ae7f8db1ee192..d565106d6fda1e3c9301694b639915a4b9bf43fd 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c @@ -84,16 +84,16 @@ #endif //Forward declaration -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void 
hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void quad_hh_trafo_real_avx_avx2_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void quad_hh_trafo_real_avx_avx2_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -148,7 +148,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, #ifdef __AVX__ for (i = 0; i < nq-8; i+=12) { - hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { @@ -158,17 +158,17 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, { if (nq-i > 4) { - hh_trafo_kernel_8_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { - hh_trafo_kernel_4_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } #else for (i = 0; i < nq-4; i+=6) { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { @@ -178,18 +178,18 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, { if (nq-i > 2) { - hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { - hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } #endif } #if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -237,12 +237,12 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i #ifdef __AVX__ for (i = 0; i < nq; i+=12) { - hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #else for (i = 0; i < nq; i+=6) { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #endif } @@ -254,7 +254,7 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void 
hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [12 x nb+3] * hh @@ -782,7 +782,7 @@ __forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh @@ -1137,7 +1137,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..549a79c685aed64bd49e5481d3037e1e27102bb1 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c @@ -0,0 +1,1376 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
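An editorial aside on the FMA wrapper macros defined a few lines below (illustration only, not part of the patch): when config-f90.h provides HAVE_AVX2, __ELPA_USE_FMA__ is set and _mm256_FMA_pd(a,b,c) resolves to a fused multiply-add a*b+c (_mm256_macc_pd on FMA4 hardware, _mm256_fmadd_pd on AVX2), with _mm256_NFMA_pd and _mm256_FMSUB_pd covering -(a*b)+c and a*b-c; without it the kernels fall back to a separate multiply and add. The first accumulation of the 12-row kernel below shows both branches:

   #ifdef __ELPA_USE_FMA__
      register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1);                  /* w1 = a3_1*h_4_3 + a4_1, fused  */
   #else
      register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3));   /* same value, separate mul + add */
   #endif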
See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". +// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) +#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) +#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) +#endif + +#endif + +//Forward declaration +__forceinline void hh_trafo_kernel_4_AVX_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_8_AVX_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_12_AVX_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); + +void quad_hh_trafo_real_avx_avx2_4hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void quad_hh_trafo_fast_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void quad_hh_trafo_real_avx_avx2_4hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; 
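// Editorial note, not part of the patch: read from the code, the six scalars s_j_k seeded above
// and completed in the loop below are the pairwise couplings between Householder vectors j and k
// that the rank-4 update needs -- the dot product of vector j with vector k shifted by (k-j)
// rows, started with the term contributed by vector j's implicit leading 1 (1-based Fortran
// indexing):
//
//    s_j_k = hh(k-j+1, k) + sum_i hh(i, j) * hh(i + (k-j), k)
//
// e.g. s_1_2 is seeded with hh(2,2) and then accumulates hh(i-1,1)*hh(i,2) over the remaining rows.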
+ s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + +// printf("s_1_2: %f\n", s_1_2); +// printf("s_1_3: %f\n", s_1_3); +// printf("s_2_3: %f\n", s_2_3); +// printf("s_1_4: %f\n", s_1_4); +// printf("s_2_4: %f\n", s_2_4); +// printf("s_3_4: %f\n", s_3_4); + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq-8; i+=12) + { + hh_trafo_kernel_12_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + if (nq == i) + { + return; + } + else + { + if (nq-i > 4) + { + hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + else + { + hh_trafo_kernel_4_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + } +#else + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_kernel_6_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + if (nq == i) + { + return; + } + else + { + if (nq-i > 2) + { + hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + else + { + hh_trafo_kernel_2_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + } +#endif +} + +#if 0 +void quad_hh_trafo_fast_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=12) + { + hh_trafo_kernel_12_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#else + for (i = 0; i < nq; i+=6) + { + hh_trafo_kernel_6_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#endif +} +#endif + +/** + * Unrolled kernel that computes + * 12 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_12_AVX_4hv_single(double* q, double* hh, int nb, 
int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [12 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); + __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); + __m256d a3_1 = _mm256_load_pd(&q[ldq]); + __m256d a4_1 = _mm256_load_pd(&q[0]); + + __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); + __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); + __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); + w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); + w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); + register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); + z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); + register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); + register __m256d x1 = a1_1; +#else + register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); + register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); + register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); + register __m256d x1 = a1_1; +#endif + + __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); + __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); + __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); + __m256d a4_2 = _mm256_load_pd(&q[0+4]); + +#ifdef __ELPA_USE_FMA__ + register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); + w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); + w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); + register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); + z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); + register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); + register __m256d x2 = a1_2; +#else + register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); + register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); + register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); + register __m256d x2 = a1_2; +#endif + + __m256d a1_3 = _mm256_load_pd(&q[(ldq*3)+8]); + __m256d a2_3 = _mm256_load_pd(&q[(ldq*2)+8]); + __m256d a3_3 = _mm256_load_pd(&q[ldq+8]); + __m256d a4_3 = _mm256_load_pd(&q[0+8]); + +#ifdef __ELPA_USE_FMA__ + register __m256d w3 = _mm256_FMA_pd(a3_3, h_4_3, a4_3); + w3 = _mm256_FMA_pd(a2_3, h_4_2, w3); + w3 = _mm256_FMA_pd(a1_3, h_4_1, w3); + register __m256d z3 = _mm256_FMA_pd(a2_3, h_3_2, a3_3); + z3 = _mm256_FMA_pd(a1_3, h_3_1, z3); + register __m256d y3 = _mm256_FMA_pd(a1_3, h_2_1, a2_3); + register __m256d x3 = a1_3; +#else + register __m256d w3 = _mm256_add_pd(a4_3, _mm256_mul_pd(a3_3, h_4_3)); + w3 = _mm256_add_pd(w3, _mm256_mul_pd(a2_3, h_4_2)); + w3 = _mm256_add_pd(w3, _mm256_mul_pd(a1_3, h_4_1)); + register __m256d z3 = _mm256_add_pd(a3_3, _mm256_mul_pd(a2_3, h_3_2)); + z3 = _mm256_add_pd(z3, _mm256_mul_pd(a1_3, h_3_1)); + register __m256d y3 = _mm256_add_pd(a2_3, _mm256_mul_pd(a1_3, h_2_1)); + register __m256d x3 = a1_3; +#endif + + __m256d q1; + __m256d q2; + __m256d 
q3; + + __m256d h1; + __m256d h2; + __m256d h3; + __m256d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + q1 = _mm256_load_pd(&q[i*ldq]); + q2 = _mm256_load_pd(&q[(i*ldq)+4]); + q3 = _mm256_load_pd(&q[(i*ldq)+8]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + x3 = _mm256_FMA_pd(q3, h1, x3); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); +#endif + + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); + y3 = _mm256_FMA_pd(q3, h2, y3); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); + y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); +#endif + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); + z3 = _mm256_FMA_pd(q3, h3, z3); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); + z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); +#endif + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); + w2 = _mm256_FMA_pd(q2, h4, w2); + w3 = _mm256_FMA_pd(q3, h4, w3); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); + w3 = _mm256_add_pd(w3, _mm256_mul_pd(q3,h4)); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); + q3 = _mm256_load_pd(&q[(nb*ldq)+8]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + x3 = _mm256_FMA_pd(q3, h1, x3); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); +#endif + + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __FMA4_ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); + y3 = _mm256_FMA_pd(q3, h2, y3); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); + y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); +#endif + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); + z3 = _mm256_FMA_pd(q3, h3, z3); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); + z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); + q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + x3 = _mm256_FMA_pd(q3, h1, x3); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); +#endif + + h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); + +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); + y3 = _mm256_FMA_pd(q3, h2, y3); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); + y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); + q3 = 
_mm256_load_pd(&q[((nb+2)*ldq)+8]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + x3 = _mm256_FMA_pd(q3, h1, x3); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [12 x nb+3] + ///////////////////////////////////////////////////// + + __m256d tau1 = _mm256_broadcast_sd(&hh[0]); + + h1 = tau1; + x1 = _mm256_mul_pd(x1, h1); + x2 = _mm256_mul_pd(x2, h1); + x3 = _mm256_mul_pd(x3, h1); + + __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); + __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); + + h1 = tau2; + h2 = _mm256_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); + y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); + y3 = _mm256_FMSUB_pd(y3, h1, _mm256_mul_pd(x3,h2)); +#else + y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); + y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); + y3 = _mm256_sub_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); +#endif + + __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); + __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); + __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); + + h1 = tau3; + h2 = _mm256_mul_pd(h1, vs_1_3); + h3 = _mm256_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); + z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); + z3 = _mm256_FMSUB_pd(z3, h1, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2))); +#else + z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); + z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); + z3 = _mm256_sub_pd(_mm256_mul_pd(z3,h1), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2))); +#endif + + __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); + __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); + __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); + __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); + + h1 = tau4; + h2 = _mm256_mul_pd(h1, vs_1_4); + h3 = _mm256_mul_pd(h1, vs_2_4); + h4 = _mm256_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); + w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); + w3 = _mm256_FMSUB_pd(w3, h1, _mm256_FMA_pd(z3, h4, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2)))); +#else + w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); + w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); + w3 = _mm256_sub_pd(_mm256_mul_pd(w3,h1), _mm256_add_pd(_mm256_mul_pd(z3,h4), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2)))); +#endif + + q1 = _mm256_load_pd(&q[0]); + q2 = _mm256_load_pd(&q[4]); + q3 = _mm256_load_pd(&q[8]); + q1 = _mm256_sub_pd(q1, w1); + q2 = _mm256_sub_pd(q2, w2); + q3 = _mm256_sub_pd(q3, w3); + _mm256_store_pd(&q[0],q1); + _mm256_store_pd(&q[4],q2); + _mm256_store_pd(&q[8],q3); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + q1 = _mm256_load_pd(&q[ldq]); + q2 = _mm256_load_pd(&q[ldq+4]); + q3 = _mm256_load_pd(&q[ldq+8]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); + 
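// --- Illustrative sketch by the editor, not part of the patch ---------------------
// The FMA wrapper macros used throughout these hunks (their definitions appear in the
// new single-precision file further below) reduce to three scalar identities:
//   _mm256_FMA_pd(a,b,c)   -> a*b + c   (fmadd / macc)
//   _mm256_NFMA_pd(a,b,c)  -> c - a*b   (fnmadd / nmacc)
//   _mm256_FMSUB_pd(a,b,c) -> a*b - c   (fmsub / msub)
// so every #ifdef __ELPA_USE_FMA__ branch computes the same value as its mul/add
// #else branch, only with fused roundings and fewer instructions. (Two guards in
// these hunks are spelled __FMA4_ and _FMA4__ rather than __ELPA_USE_FMA__; those
// symbols are never defined, so the affected blocks simply take the mul/add fallback
// with unchanged results.) A minimal standalone check, assuming only standard C:

#include <assert.h>

double fma_like(double a, double b, double c)   { return a * b + c; } /* _mm256_FMA_pd   */
double nfma_like(double a, double b, double c)  { return c - a * b; } /* _mm256_NFMA_pd  */
double fmsub_like(double a, double b, double c) { return a * b - c; } /* _mm256_FMSUB_pd */

int main(void)
{
    double q = 2.0, h = 0.5, x = 3.0;
    assert(fma_like(q, h, x)   == x + q * h);  /* accumulate: x += q*h        */
    assert(nfma_like(x, h, q)  == q - x * h);  /* update:     q -= x*h        */
    assert(fmsub_like(x, h, q) == x * h - q);  /* scale:      y = y*h1 - x*h2 */
    return 0;
}

// Hiding the fused multiply-add behind one macro lets the same kernel body compile
// for AMD FMA4 (_mm256_macc_pd) and AVX2/FMA3 (_mm256_fmadd_pd), as the
// #ifdef HAVE_AVX2 block in the new file below shows.
// -----------------------------------------------------------------------------------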
q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); + q3 = _mm256_sub_pd(q3, _mm256_FMA_pd(w3, h4, z3)); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); + q3 = _mm256_sub_pd(q3, _mm256_add_pd(z3, _mm256_mul_pd(w3, h4))); +#endif + _mm256_store_pd(&q[ldq],q1); + _mm256_store_pd(&q[ldq+4],q2); + _mm256_store_pd(&q[ldq+8],q3); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + q1 = _mm256_load_pd(&q[ldq*2]); + q2 = _mm256_load_pd(&q[(ldq*2)+4]); + q3 = _mm256_load_pd(&q[(ldq*2)+8]); + q1 = _mm256_sub_pd(q1, y1); + q2 = _mm256_sub_pd(q2, y2); + q3 = _mm256_sub_pd(q3, y3); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); + q3 = _mm256_NFMA_pd(w3, h4, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q3 = _mm256_NFMA_pd(z3, h3, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); +#endif + _mm256_store_pd(&q[ldq*2],q1); + _mm256_store_pd(&q[(ldq*2)+4],q2); + _mm256_store_pd(&q[(ldq*2)+8],q3); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + q1 = _mm256_load_pd(&q[ldq*3]); + q2 = _mm256_load_pd(&q[(ldq*3)+4]); + q3 = _mm256_load_pd(&q[(ldq*3)+8]); + q1 = _mm256_sub_pd(q1, x1); + q2 = _mm256_sub_pd(q2, x2); + q3 = _mm256_sub_pd(q3, x3); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); + q3 = _mm256_NFMA_pd(w3, h4, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q3 = _mm256_NFMA_pd(y3, h2, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q3 = _mm256_NFMA_pd(z3, h3, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); +#endif + _mm256_store_pd(&q[ldq*3], q1); + _mm256_store_pd(&q[(ldq*3)+4], q2); + _mm256_store_pd(&q[(ldq*3)+8], q3); + + for (i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + + q1 = _mm256_load_pd(&q[i*ldq]); + q2 = _mm256_load_pd(&q[(i*ldq)+4]); + q3 = _mm256_load_pd(&q[(i*ldq)+8]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q3 = _mm256_NFMA_pd(x3, h1, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); +#endif + + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q3 = _mm256_NFMA_pd(y3, h2, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); +#endif + + h3 = 
_mm256_broadcast_sd(&hh[(ldh*2)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q3 = _mm256_NFMA_pd(z3, h3, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); +#endif + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); + q3 = _mm256_NFMA_pd(w3, h4, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1,h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2,h4)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3,h4)); +#endif + + _mm256_store_pd(&q[i*ldq],q1); + _mm256_store_pd(&q[(i*ldq)+4],q2); + _mm256_store_pd(&q[(i*ldq)+8],q3); + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); + q3 = _mm256_load_pd(&q[(nb*ldq)+8]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q3 = _mm256_NFMA_pd(x3, h1, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q3 = _mm256_NFMA_pd(y3, h2, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q3 = _mm256_NFMA_pd(z3, h3, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); +#endif + _mm256_store_pd(&q[nb*ldq],q1); + _mm256_store_pd(&q[(nb*ldq)+4],q2); + _mm256_store_pd(&q[(nb*ldq)+8],q3); + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); + q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q3 = _mm256_NFMA_pd(x3, h1, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q3 = _mm256_NFMA_pd(y3, h2, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); +#endif + _mm256_store_pd(&q[(nb+1)*ldq],q1); + _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); + _mm256_store_pd(&q[((nb+1)*ldq)+8],q3); + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); + q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q3 = _mm256_NFMA_pd(x3, h1, q3); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); + q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); +#endif + _mm256_store_pd(&q[(nb+2)*ldq],q1); + _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); + _mm256_store_pd(&q[((nb+2)*ldq)+8],q3); +} + +/** + * Unrolled kernel that computes + * 8 rows of Q 
simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_8_AVX_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); + __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); + __m256d a3_1 = _mm256_load_pd(&q[ldq]); + __m256d a4_1 = _mm256_load_pd(&q[0]); + + __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); + __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); + __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); + w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); + w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); + __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); + z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); + __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); + __m256d x1 = a1_1; +#else + __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); + __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); + __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); + __m256d x1 = a1_1; +#endif + + __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); + __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); + __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); + __m256d a4_2 = _mm256_load_pd(&q[0+4]); + +#ifdef __ELPA_USE_FMA__ + __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); + w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); + w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); + __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); + z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); + __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); + __m256d x2 = a1_2; +#else + __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); + __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); + __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); + __m256d x2 = a1_2; +#endif + + __m256d q1; + __m256d q2; + + __m256d h1; + __m256d h2; + __m256d h3; + __m256d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); + + q1 = _mm256_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + y1 = _mm256_FMA_pd(q1, h2, y1); + z1 = _mm256_FMA_pd(q1, h3, z1); + w1 = _mm256_FMA_pd(q1, h4, w1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); +#endif + + q2 = _mm256_load_pd(&q[(i*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x2 = _mm256_FMA_pd(q2, h1, x2); + y2 = _mm256_FMA_pd(q2, h2, y2); + z2 = _mm256_FMA_pd(q2, h3, z2); + w2 = _mm256_FMA_pd(q2, h4, w2); +#else + x2 = _mm256_add_pd(x2, 
_mm256_mul_pd(q2,h1)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); + + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); + + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); + +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [8 x nb+3] + ///////////////////////////////////////////////////// + + __m256d tau1 = _mm256_broadcast_sd(&hh[0]); + __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); + __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); + __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); + + __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); + __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); + __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); + __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); + __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); + __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); + + h1 = tau1; + x1 = _mm256_mul_pd(x1, h1); + x2 = _mm256_mul_pd(x2, h1); + + h1 = tau2; + h2 = _mm256_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); + y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); +#else + y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); + y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); +#endif + + h1 = tau3; + h2 = _mm256_mul_pd(h1, vs_1_3); + h3 = _mm256_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); + z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); +#else + z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); + z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); +#endif + + h1 = tau4; + h2 = _mm256_mul_pd(h1, vs_1_4); + h3 = _mm256_mul_pd(h1, vs_2_4); + h4 = _mm256_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, 
_mm256_mul_pd(x1,h2)))); + w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); +#else + w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); + w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); +#endif + + q1 = _mm256_load_pd(&q[0]); + q2 = _mm256_load_pd(&q[4]); + q1 = _mm256_sub_pd(q1, w1); + q2 = _mm256_sub_pd(q2, w2); + _mm256_store_pd(&q[0],q1); + _mm256_store_pd(&q[4],q2); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + q1 = _mm256_load_pd(&q[ldq]); + q2 = _mm256_load_pd(&q[ldq+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); + q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); +#endif + _mm256_store_pd(&q[ldq],q1); + _mm256_store_pd(&q[ldq+4],q2); + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + q1 = _mm256_load_pd(&q[ldq*2]); + q2 = _mm256_load_pd(&q[(ldq*2)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, y1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_sub_pd(q2, y2); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd(y2, _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4)))); +#endif + _mm256_store_pd(&q[ldq*2],q1); + _mm256_store_pd(&q[(ldq*2)+4],q2); + + h2 = _mm256_broadcast_sd(&hh[ldh+1]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + q1 = _mm256_load_pd(&q[ldq*3]); + q2 = _mm256_load_pd(&q[(ldq*3)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, x1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_sub_pd(q2, x2); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd(x2, _mm256_add_pd(_mm256_mul_pd(y2, h2), _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4))))); +#endif + _mm256_store_pd(&q[ldq*3], q1); + _mm256_store_pd(&q[(ldq*3)+4], q2); + + for (i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); + +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_load_pd(&q[i*ldq]); + q2 = _mm256_load_pd(&q[(i*ldq)+4]); + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q2 = _mm256_NFMA_pd(z2, h3, q2); + q2 = _mm256_NFMA_pd(w2, h4, q2); + _mm256_store_pd(&q[i*ldq],q1); + _mm256_store_pd(&q[(i*ldq)+4],q2); +#else + q1 = _mm256_load_pd(&q[i*ldq]); + q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); + _mm256_store_pd(&q[i*ldq],q1); + + q2 = _mm256_load_pd(&q[(i*ldq)+4]); + q2 = 
_mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2, h4), _mm256_mul_pd(z2, h3)), _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)))); + _mm256_store_pd(&q[(i*ldq)+4],q2); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q2 = _mm256_NFMA_pd(y2, h2, q2); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(y2, h2)) , _mm256_mul_pd(x2, h1))); +#endif + _mm256_store_pd(&q[nb*ldq],q1); + _mm256_store_pd(&q[(nb*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); + q2 = _mm256_sub_pd(q2, _mm256_add_pd( _mm256_mul_pd(y2, h2) , _mm256_mul_pd(x2, h1))); +#endif + _mm256_store_pd(&q[(nb+1)*ldq],q1); + _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + _mm256_store_pd(&q[(nb+2)*ldq],q1); + _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_AVX_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); + __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); + __m256d a3_1 = _mm256_load_pd(&q[ldq]); + __m256d a4_1 = _mm256_load_pd(&q[0]); + + __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); + __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); + __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + +#ifdef __ELPA_USE_FMA__ + __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); + w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); + w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); + __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); + z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); + __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); + __m256d x1 = a1_1; +#else + __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); + w1 = _mm256_add_pd(w1, 
_mm256_mul_pd(a1_1, h_4_1)); + __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); + __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); + __m256d x1 = a1_1; +#endif + + __m256d q1; + + __m256d h1; + __m256d h2; + __m256d h3; + __m256d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); + + q1 = _mm256_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + y1 = _mm256_FMA_pd(q1, h2, y1); + z1 = _mm256_FMA_pd(q1, h3, z1); + w1 = _mm256_FMA_pd(q1, h4, w1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); + q1 = _mm256_load_pd(&q[nb*ldq]); +#ifdef _FMA4__ + x1 = _mm256_FMA_pd(q1, h1, x1); + y1 = _mm256_FMA_pd(q1, h2, y1); + z1 = _mm256_FMA_pd(q1, h3, z1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + __m256d tau1 = _mm256_broadcast_sd(&hh[0]); + __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); + __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); + __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); + + __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); + __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); + __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); + __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); + __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); + __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); + + h1 = tau1; + x1 = _mm256_mul_pd(x1, h1); + + h1 = tau2; + h2 = _mm256_mul_pd(h1, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); +#else + y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); +#endif + + h1 = tau3; + h2 = _mm256_mul_pd(h1, vs_1_3); + h3 = _mm256_mul_pd(h1, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); +#else + z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); +#endif + + h1 = tau4; + h2 = _mm256_mul_pd(h1, vs_1_4); + h3 = _mm256_mul_pd(h1, vs_2_4); + h4 = _mm256_mul_pd(h1, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); +#else + w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); +#endif + + q1 = _mm256_load_pd(&q[0]); + q1 = _mm256_sub_pd(q1, w1); + 
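// --- Illustrative sketch by the editor, not part of the patch ---------------------
// The "rank-1 update" preamble just above (tau1..tau4, vs_1_2..vs_3_4) is identical in
// all three kernel widths; only the number of __m256d lanes differs. Written out for a
// single scalar row it is: scale x by tau1, then correct each later Householder
// direction by the scalar products of the earlier, already-updated ones. The helper
// below mirrors the intrinsics line by line (names are illustrative only):

void apply_tau_and_correct(double *x, double *y, double *z, double *w,
                           double tau1, double tau2, double tau3, double tau4,
                           double s_1_2, double s_1_3, double s_2_3,
                           double s_1_4, double s_2_4, double s_3_4)
{
    *x = tau1 * (*x);                              /* x1 = _mm256_mul_pd(x1, tau1)              */
    *y = tau2 * (*y) - (tau2 * s_1_2) * (*x);      /* y1 = FMSUB(y1, h1, x1*h2)                 */
    *z = tau3 * (*z)                               /* z1 = FMSUB(z1, h1, y1*h3 + x1*h2)         */
        - ((tau3 * s_2_3) * (*y) + (tau3 * s_1_3) * (*x));
    *w = tau4 * (*w)                               /* w1 = FMSUB(w1, h1, z1*h4 + y1*h3 + x1*h2) */
        - ((tau4 * s_3_4) * (*z) + (tau4 * s_2_4) * (*y) + (tau4 * s_1_4) * (*x));
}

// The stores that follow then subtract, block row by block row: w; z + w*h; y + z*h + w*h;
// x + y*h + z*h + w*h (each h being the matching element of hh); and the generic band
// update for rows 4..nb-1 -- exactly what the surrounding _mm256_NFMA_pd / _mm256_sub_pd
// sequences do for 4, 8 or 12 rows at a time.
// -----------------------------------------------------------------------------------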
_mm256_store_pd(&q[0],q1); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + q1 = _mm256_load_pd(&q[ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); +#endif + _mm256_store_pd(&q[ldq],q1); + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + q1 = _mm256_load_pd(&q[ldq*2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, y1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); +#endif + _mm256_store_pd(&q[ldq*2],q1); + + h2 = _mm256_broadcast_sd(&hh[ldh+1]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); + q1 = _mm256_load_pd(&q[ldq*3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_sub_pd(q1, x1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); +#endif + _mm256_store_pd(&q[ldq*3], q1); + + for (i = 4; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); + + q1 = _mm256_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); +#endif + _mm256_store_pd(&q[i*ldq],q1); + } + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); + q1 = _mm256_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); +#endif + _mm256_store_pd(&q[nb*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); +#endif + _mm256_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + _mm256_store_pd(&q[(nb+2)*ldq],q1); +} + diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c similarity index 98% rename from src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c rename to src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c index 67338c471297a6a5e6355b62e877d9be36f267d0..e8342a9d99cba3f1a6eb6621e226aa4ae4e21783 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c @@ -85,15 +85,15 @@ #endif //Forward declaration -static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, 
int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -void hexa_hh_trafo_real_avx_avx2_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void hexa_hh_trafo_real_avx_avx2_6hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void hexa_hh_trafo_real_avx_avx2_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void hexa_hh_trafo_real_avx_avx2_6hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -236,7 +236,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_(double* q, double* hh, int* pnb, int* pnq, #ifdef __AVX__ for (i = 0; i < nq-4; i+=8) { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { @@ -244,12 +244,12 @@ void hexa_hh_trafo_real_avx_avx2_6hv_(double* q, double* hh, int* pnb, int* pnq, } else { - hh_trafo_kernel_4_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #else for (i = 0; i < nq-2; i+=4) { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { @@ -257,7 +257,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_(double* q, double* hh, int* pnb, int* pnq, } else { - hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } @@ -406,12 +406,12 @@ void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i #ifdef __AVX__ for (i = 0; i < nq; i+=8) { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #else for (i = 0; i < nq; i+=4) { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } @@ -423,7 +423,7 @@ void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +__forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+3] * hh @@ -1196,7 +1196,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +__forceinline void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+3] * hh diff --git 
a/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..ccd185fb2e3f856b317da105b2414129221559ae --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c @@ -0,0 +1,1760 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". 
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) +#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) +#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) +#endif + +#endif + +//Forward declaration +static void hh_trafo_kernel_4_AVX_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_8_AVX_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); + +void hexa_hh_trafo_real_avx_avx2_6hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void hexa_hh_trafo_real_avx_avx2_6hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += 
hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); +// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: %f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq-4; i+=8) + { + hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } + if (nq == i) + { + return; + } + else + { + hh_trafo_kernel_4_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#else + for (i = 0; i < nq-2; i+=4) + { + hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } + if (nq == i) + { + return; + } + else + { + hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#endif +} + +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// 
scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += 
hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); +// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: %f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=8) + { + hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#else + for (i = 0; i < nq; i+=4) + { + hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#endif +} +#endif + +/** + * Unrolled kernel that computes + * 8 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_8_AVX_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); + __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); + __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); + __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); + __m256d a5_1 = _mm256_load_pd(&q[ldq]); + __m256d a6_1 = _mm256_load_pd(&q[0]); + + __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); + __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); + __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); + __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); + __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); + t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); + t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); + t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); + t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); +#else + register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); +#endif + __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); + __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); + __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); + __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); + v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); + v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); + v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); +#else + register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); +#endif + __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + __m256d h_4_2 = 
_mm256_broadcast_sd(&hh[(ldh*3)+2]); + __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); + w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); + w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); +#else + register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); +#endif + __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); + __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); + z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); + register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); +#else + register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); + register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); +#endif + register __m256d x1 = a1_1; + + + __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]); + __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]); + __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]); + __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]); + __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]); + __m256d a6_2 = _mm256_load_pd(&q[4]); + +#ifdef __ELPA_USE_FMA__ + register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2); + t2 = _mm256_FMA_pd(a4_2, h_6_4, t2); + t2 = _mm256_FMA_pd(a3_2, h_6_3, t2); + t2 = _mm256_FMA_pd(a2_2, h_6_2, t2); + t2 = _mm256_FMA_pd(a1_2, h_6_1, t2); + register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2); + v2 = _mm256_FMA_pd(a3_2, h_5_3, v2); + v2 = _mm256_FMA_pd(a2_2, h_5_2, v2); + v2 = _mm256_FMA_pd(a1_2, h_5_1, v2); + register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); + w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); + w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); + register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); + z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); + register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); +#else + register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5)); + t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4)); + t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3)); + t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2)); + t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1)); + register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4)); + v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3)); + v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2)); + v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1)); + register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); + register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); + register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); +#endif + register __m256d x2 = a1_2; + + __m256d q1; + __m256d q2; + + __m256d h1; + __m256d h2; + __m256d h3; + __m256d h4; + __m256d h5; + __m256d h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm256_broadcast_sd(&hh[i-5]); + q1 = _mm256_load_pd(&q[i*ldq]); + q2 = _mm256_load_pd(&q[(i*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = 
_mm256_FMA_pd(q2, h2, y2); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); + w2 = _mm256_FMA_pd(q2, h4, w2); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMA_pd(q1, h5, v1); + v2 = _mm256_FMA_pd(q2, h5, v2); +#else + v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); + v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + t1 = _mm256_FMA_pd(q1, h6, t1); + t2 = _mm256_FMA_pd(q2, h6, t2); +#else + t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); + t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6)); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-5]); + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); + w2 = _mm256_FMA_pd(q2, h4, w2); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMA_pd(q1, h5, v1); + v2 = _mm256_FMA_pd(q2, h5, v2); +#else + v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); + v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-4]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); + w2 = _mm256_FMA_pd(q2, h4, w2); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); + w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); +#endif + + h1 = 
_mm256_broadcast_sd(&hh[nb-3]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); + z2 = _mm256_FMA_pd(q2, h3, z2); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); + z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + q1 = _mm256_load_pd(&q[(nb+3)*ldq]); + q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); + y2 = _mm256_FMA_pd(q2, h2, y2); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); + y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+4)*ldq]); + q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); + x2 = _mm256_FMA_pd(q2, h1, x2); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); + x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); +#endif + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m256d tau1 = _mm256_broadcast_sd(&hh[0]); + x1 = _mm256_mul_pd(x1, tau1); + x2 = _mm256_mul_pd(x2, tau1); + + __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); + __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); + h2 = _mm256_mul_pd(tau2, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); + y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2)); +#else + y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); + y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2)); +#endif + + __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); + __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); + __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); + h2 = _mm256_mul_pd(tau3, vs_1_3); + h3 = _mm256_mul_pd(tau3, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); + z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); +#else + z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); + z2 = _mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); +#endif + + __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); + __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); + __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); + h2 = _mm256_mul_pd(tau4, vs_1_4); + h3 = _mm256_mul_pd(tau4, vs_2_4); + __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); + h4 = _mm256_mul_pd(tau4, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, 
_mm256_mul_pd(x1,h2)))); + w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); +#else + w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); + w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); +#endif + + __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); + __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); + __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); + h2 = _mm256_mul_pd(tau5, vs_1_5); + h3 = _mm256_mul_pd(tau5, vs_2_5); + __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); + __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); + h4 = _mm256_mul_pd(tau5, vs_3_5); + h5 = _mm256_mul_pd(tau5, vs_4_5); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); + v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); +#else + v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); + v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); +#endif + + __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); + __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); + __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); + h2 = _mm256_mul_pd(tau6, vs_1_6); + h3 = _mm256_mul_pd(tau6, vs_2_6); + __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); + __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); + __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); + h4 = _mm256_mul_pd(tau6, vs_3_6); + h5 = _mm256_mul_pd(tau6, vs_4_6); + h6 = _mm256_mul_pd(tau6, vs_5_6); +#ifdef __ELPA_USE_FMA__ + t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); + t2 = _mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))))); +#else + t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); + t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))))); +#endif + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [8 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm256_load_pd(&q[0]); + q2 = _mm256_load_pd(&q[4]); + q1 = _mm256_sub_pd(q1, t1); + q2 = _mm256_sub_pd(q2, t2); + _mm256_store_pd(&q[0],q1); + _mm256_store_pd(&q[4],q2); + + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); + q1 = _mm256_load_pd(&q[ldq]); + q2 = _mm256_load_pd(&q[(ldq+4)]); + q1 = _mm256_sub_pd(q1, v1); + q2 = _mm256_sub_pd(q2, v2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); 
+#endif + _mm256_store_pd(&q[ldq],q1); + _mm256_store_pd(&q[(ldq+4)],q2); + + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); + q1 = _mm256_load_pd(&q[ldq*2]); + q2 = _mm256_load_pd(&q[(ldq*2)+4]); + q1 = _mm256_sub_pd(q1, w1); + q2 = _mm256_sub_pd(q2, w2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); +#endif + _mm256_store_pd(&q[ldq*2],q1); + _mm256_store_pd(&q[(ldq*2)+4],q2); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + q1 = _mm256_load_pd(&q[ldq*3]); + q2 = _mm256_load_pd(&q[(ldq*3)+4]); + q1 = _mm256_sub_pd(q1, z1); + q2 = _mm256_sub_pd(q2, z2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); +#endif + _mm256_store_pd(&q[ldq*3],q1); + _mm256_store_pd(&q[(ldq*3)+4],q2); + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + q1 = _mm256_load_pd(&q[ldq*4]); + q2 = _mm256_load_pd(&q[(ldq*4)+4]); + q1 = _mm256_sub_pd(q1, y1); + q2 = _mm256_sub_pd(q2, y2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); +#endif + _mm256_store_pd(&q[ldq*4],q1); + _mm256_store_pd(&q[(ldq*4)+4],q2); + + h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); + q1 = _mm256_load_pd(&q[ldq*5]); + q2 = _mm256_load_pd(&q[(ldq*5)+4]); + q1 = _mm256_sub_pd(q1, x1); + q2 = _mm256_sub_pd(q2, x2); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = 
_mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); +#endif + _mm256_store_pd(&q[ldq*5],q1); + _mm256_store_pd(&q[(ldq*5)+4],q2); + + for (i = 6; i < nb; i++) + { + q1 = _mm256_load_pd(&q[i*ldq]); + q2 = _mm256_load_pd(&q[(i*ldq)+4]); + h1 = _mm256_broadcast_sd(&hh[i-5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); + q2 = _mm256_NFMA_pd(t2, h6, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); +#endif + _mm256_store_pd(&q[i*ldq],q1); + _mm256_store_pd(&q[(i*ldq)+4],q2); + } + + h1 = _mm256_broadcast_sd(&hh[nb-5]); + q1 = _mm256_load_pd(&q[nb*ldq]); + q2 = _mm256_load_pd(&q[(nb*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); + q2 = _mm256_NFMA_pd(v2, h5, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); +#endif + _mm256_store_pd(&q[nb*ldq],q1); + _mm256_store_pd(&q[(nb*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-4]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); + q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); + q2 = _mm256_NFMA_pd(w2, h4, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); +#endif + _mm256_store_pd(&q[(nb+1)*ldq],q1); + _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); + q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); + q2 = _mm256_NFMA_pd(z2, h3, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); +#endif + _mm256_store_pd(&q[(nb+2)*ldq],q1); + _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + q1 = _mm256_load_pd(&q[(nb+3)*ldq]); + q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); + q2 = _mm256_NFMA_pd(y2, h2, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); +#endif + _mm256_store_pd(&q[(nb+3)*ldq],q1); + _mm256_store_pd(&q[((nb+3)*ldq)+4],q2); + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+4)*ldq]); + q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); + q2 = _mm256_NFMA_pd(x2, h1, q2); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); + q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); +#endif + _mm256_store_pd(&q[(nb+4)*ldq],q1); + 
_mm256_store_pd(&q[((nb+4)*ldq)+4],q2); +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_AVX_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); + __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); + __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); + __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); + __m256d a5_1 = _mm256_load_pd(&q[ldq]); + __m256d a6_1 = _mm256_load_pd(&q[0]); + + __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); + __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); + __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); + __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); + __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); + t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); + t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); + t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); + t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); +#else + register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); + t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); +#endif + __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); + __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); + __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); + __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); + v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); + v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); + v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); +#else + register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); + v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); +#endif + __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); + __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); + w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); + w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); +#else + register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); + w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); +#endif + __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); + __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); + z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); + register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); +#else + register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); + z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); + register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); +#endif + register __m256d x1 = a1_1; + + __m256d q1; + + __m256d h1; + __m256d h2; + __m256d h3; + __m256d h4; + __m256d h5; + __m256d h6; + + for(i = 6; i < nb; i++) + { + h1 = 
_mm256_broadcast_sd(&hh[i-5]); + q1 = _mm256_load_pd(&q[i*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMA_pd(q1, h5, v1); +#else + v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + t1 = _mm256_FMA_pd(q1, h6, t1); +#else + t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); +#endif + } + + h1 = _mm256_broadcast_sd(&hh[nb-5]); + q1 = _mm256_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMA_pd(q1, h5, v1); +#else + v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-4]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMA_pd(q1, h4, w1); +#else + w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMA_pd(q1, h3, z1); +#else + z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + q1 = _mm256_load_pd(&q[(nb+3)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMA_pd(q1, h2, y1); +#else + y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); +#endif + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = 
_mm256_load_pd(&q[(nb+4)*ldq]); +#ifdef __ELPA_USE_FMA__ + x1 = _mm256_FMA_pd(q1, h1, x1); +#else + x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); +#endif + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m256d tau1 = _mm256_broadcast_sd(&hh[0]); + x1 = _mm256_mul_pd(x1, tau1); + + __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); + __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); + h2 = _mm256_mul_pd(tau2, vs_1_2); +#ifdef __ELPA_USE_FMA__ + y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); +#else + y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); +#endif + + __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); + __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); + __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); + h2 = _mm256_mul_pd(tau3, vs_1_3); + h3 = _mm256_mul_pd(tau3, vs_2_3); +#ifdef __ELPA_USE_FMA__ + z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); +#else + z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); +#endif + + __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); + __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); + __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); + h2 = _mm256_mul_pd(tau4, vs_1_4); + h3 = _mm256_mul_pd(tau4, vs_2_4); + __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); + h4 = _mm256_mul_pd(tau4, vs_3_4); +#ifdef __ELPA_USE_FMA__ + w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); +#else + w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); +#endif + + __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); + __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); + __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); + h2 = _mm256_mul_pd(tau5, vs_1_5); + h3 = _mm256_mul_pd(tau5, vs_2_5); + __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); + __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); + h4 = _mm256_mul_pd(tau5, vs_3_5); + h5 = _mm256_mul_pd(tau5, vs_4_5); +#ifdef __ELPA_USE_FMA__ + v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); +#else + v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); +#endif + + __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); + __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); + __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); + h2 = _mm256_mul_pd(tau6, vs_1_6); + h3 = _mm256_mul_pd(tau6, vs_2_6); + __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); + __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); + __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); + h4 = _mm256_mul_pd(tau6, vs_3_6); + h5 = _mm256_mul_pd(tau6, vs_4_6); + h6 = _mm256_mul_pd(tau6, vs_5_6); +#ifdef __ELPA_USE_FMA__ + t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); +#else + t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); +#endif + + 
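The six FMSUB/FMA blocks above fold the fifteen pairwise scalar products of the six Householder vectors (scalarprods[0..14], ordered s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4, s_1_5, ...) into the accumulated projections x1, y1, z1, w1, v1, t1 before the rank-1 updates that follow. Per matrix element this is the triangular recurrence sketched below; apply_tau_scalar and its argument names are illustrative only, not ELPA routines.

/* Sketch of the per-element tau/scalar-product correction of the 6hv kernels.
 * p[0..5] correspond to x, y, z, w, v, t; tau[k] = hh[k*ldh]; scalarprods
 * holds s_{j,k} for j < k in the order s_1_2, s_1_3, s_2_3, s_1_4, ... */
static void apply_tau_scalar(double p[6], const double tau[6],
                             const double scalarprods[15])
{
    int k, j, idx = 0;
    for (k = 0; k < 6; k++)
    {
        double corr = 0.0;
        for (j = 0; j < k; j++)
        {
            corr += scalarprods[idx++] * p[j];  /* p[j] is already scaled by its tau */
        }
        p[k] = tau[k] * (p[k] - corr);
    }
}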
///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm256_load_pd(&q[0]); + q1 = _mm256_sub_pd(q1, t1); + _mm256_store_pd(&q[0],q1); + + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); + q1 = _mm256_load_pd(&q[ldq]); + q1 = _mm256_sub_pd(q1, v1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[ldq],q1); + + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); + q1 = _mm256_load_pd(&q[ldq*2]); + q1 = _mm256_sub_pd(q1, w1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[ldq*2],q1); + + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); + q1 = _mm256_load_pd(&q[ldq*3]); + q1 = _mm256_sub_pd(q1, z1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[ldq*3],q1); + + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); + q1 = _mm256_load_pd(&q[ldq*4]); + q1 = _mm256_sub_pd(q1, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[ldq*4],q1); + + h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); + q1 = _mm256_load_pd(&q[ldq*5]); + q1 = _mm256_sub_pd(q1, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[ldq*5],q1); + + for (i = 6; i < nb; i++) + { + q1 = _mm256_load_pd(&q[i*ldq]); + h1 = _mm256_broadcast_sd(&hh[i-5]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); +#ifdef __ELPA_USE_FMA__ + q1 = 
_mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(t1, h6, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); +#endif + _mm256_store_pd(&q[i*ldq],q1); + } + + h1 = _mm256_broadcast_sd(&hh[nb-5]); + q1 = _mm256_load_pd(&q[nb*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(v1, h5, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); +#endif + _mm256_store_pd(&q[nb*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-4]); + q1 = _mm256_load_pd(&q[(nb+1)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(w1, h4, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); +#endif + _mm256_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-3]); + q1 = _mm256_load_pd(&q[(nb+2)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(z1, h3, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); +#endif + _mm256_store_pd(&q[(nb+2)*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-2]); + q1 = _mm256_load_pd(&q[(nb+3)*ldq]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); +#ifdef __ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(y1, h2, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); +#endif + _mm256_store_pd(&q[(nb+3)*ldq],q1); + + h1 = _mm256_broadcast_sd(&hh[nb-1]); + q1 = _mm256_load_pd(&q[(nb+4)*ldq]); +#ifdef 
__ELPA_USE_FMA__ + q1 = _mm256_NFMA_pd(x1, h1, q1); +#else + q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); +#endif + _mm256_store_pd(&q[(nb+4)*ldq],q1); +} + diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c similarity index 93% rename from src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c rename to src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c index acd891d6ff2d891f06de17bd25df0f6659885b58..4e3f396f6ad330dd97039dbb7a68c80b1eaba613 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c @@ -72,16 +72,16 @@ #endif //Forward declaration -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); +__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s); -void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void double_hh_trafo_real_sse_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void double_hh_trafo_real_sse_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -102,7 +102,7 @@ void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, in // Production level kernel calls with padding for (i = 0; i < nq-8; i+=12) { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } if (nq == i) { @@ -112,11 +112,11 @@ void double_hh_trafo_real_sse_2hv_(double* q, double* hh, int* pnb, int* pnq, in { if (nq-i > 4) { - hh_trafo_kernel_8_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i > 0) { - hh_trafo_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } } } @@ -144,12 +144,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, #ifdef __AVX__ for (i = 0; i < nq; i+=24) { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #else for (i = 0; i < nq; i+=12) { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); + hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s); } #endif } @@ -160,7 +160,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ - __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) + __forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector 
Multiplication, Q [12 x nb+1] * hh @@ -352,7 +352,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, * matrix vector product with two householder * vectors + a rank 2 update is performed */ -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) +__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+1] * hh @@ -500,7 +500,7 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 2 update is performed */ -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) +__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+1] * hh diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..ab22b9259abffbcd7d7484272f7119ee81f99bb0 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c @@ -0,0 +1,599 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
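The file added below mirrors the double-precision SSE 2hv kernel: a Fortran-callable driver forms the scalar product s of the two Householder vectors and dispatches 12-, 8- and 4-row unrolled kernels. For reference, the arithmetic each SSE lane performs for one column entry j can be written in scalar form as follows; hh_trafo_2hv_scalar_ref is an illustrative sketch, not an ELPA routine, and assumes the same q/hh/ldq/ldh layout and precomputed s as the kernels below.

/* Scalar sketch of one lane of the 2hv kernels: two dot products against q,
 * application of -tau1/-tau2 with the scalar-product correction s, then the
 * rank-2 update of q. */
static void hh_trafo_2hv_scalar_ref(double *q, const double *hh,
                                    int nb, int ldq, int ldh,
                                    double s, int j)
{
    int i;
    double x = q[ldq + j];
    double y = q[j] + x * hh[ldh + 1];

    for (i = 2; i < nb; i++)
    {
        x += q[i * ldq + j] * hh[i - 1];
        y += q[i * ldq + j] * hh[ldh + i];
    }
    x += q[nb * ldq + j] * hh[nb - 1];

    x = -hh[0] * x;                       /* scale by -tau1 */
    y = -hh[ldh] * y - hh[ldh] * s * x;   /* scale by -tau2, correct with s */

    q[j]       += y;
    q[ldq + j] += x + y * hh[ldh + 1];
    for (i = 2; i < nb; i++)
    {
        q[i * ldq + j] += x * hh[i - 1] + y * hh[ldh + i];
    }
    q[nb * ldq + j] += x * hh[nb - 1];
}

The unrolled kernels keep x and y for 12, 8 or 4 adjacent values of j in __m128d registers and interleave the two passes over q.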
+//
+//
+// --------------------------------------------------------------------------------------------------
+//
+// This file contains the compute intensive kernels for the Householder transformations.
+// It should be compiled with the highest possible optimization level.
+//
+// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
+// On Intel Sandy Bridge use -O3 -mavx
+//
+// Copyright of the original code rests with the authors inside the ELPA
+// consortium. The copyright of any additional modifications shall rest
+// with their original authors, but shall adhere to the licensing terms
+// distributed along with the original code in the file "COPYING".
+//
+// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
+// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
+// --------------------------------------------------------------------------------------------------
+
+#include "config-f90.h"
+
+#include <x86intrin.h>
+
+#define __forceinline __attribute__((always_inline)) static
+
+
+#ifdef HAVE_SSE
+#undef __AVX__
+#endif
+
+//Forward declaration
+__forceinline void hh_trafo_kernel_4_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s);
+__forceinline void hh_trafo_kernel_8_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s);
+__forceinline void hh_trafo_kernel_12_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s);
+
+void double_hh_trafo_real_sse_2hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
+#if 0
+void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
+#endif
+
+void double_hh_trafo_real_sse_2hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
+{
+    int i;
+    int nb = *pnb;
+    int nq = *pldq;
+    int ldq = *pldq;
+    int ldh = *pldh;
+
+    // calculating scalar product to compute
+    // 2 householder vectors simultaneously
+    double s = hh[(ldh)+1]*1.0;
+
+    #pragma ivdep
+    for (i = 2; i < nb; i++)
+    {
+        s += hh[i-1] * hh[(i+ldh)];
+    }
+
+    // Production level kernel calls with padding
+    for (i = 0; i < nq-8; i+=12)
+    {
+        hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
+    }
+    if (nq == i)
+    {
+        return;
+    }
+    else
+    {
+        if (nq-i > 4)
+        {
+            hh_trafo_kernel_8_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
+        }
+        else if (nq-i > 0)
+        {
+            hh_trafo_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
+        }
+    }
+}
+
+#if 0
+void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
+{
+    int i;
+    int nb = *pnb;
+    int nq = *pldq;
+    int ldq = *pldq;
+    int ldh = *pldh;
+
+    // calculating scalar product to compute
+    // 2 householder vectors simultaneously
+    double s = hh[(ldh)+1]*1.0;
+
+    #pragma ivdep
+    for (i = 2; i < nb; i++)
+    {
+        s += hh[i-1] * hh[(i+ldh)];
+    }
+
+    // Production level kernel calls with padding
+#ifdef __AVX__
+    for (i = 0; i < nq; i+=24)
+    {
+        hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
+    }
+#else
+    for (i = 0; i < nq; i+=12)
+    {
+        hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
+    }
+#endif
+}
+#endif
+/**
+ * Unrolled kernel that computes
+ * 12 rows of Q simultaneously, a
+ * matrix vector product with two householder
+ * vectors + a rank 2 update is performed
+ */
+ __forceinline void hh_trafo_kernel_12_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s)
+{
+    /////////////////////////////////////////////////////
+    // Matrix Vector Multiplication, Q [12 x nb+1] * hh
+    // hh 
contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + __m128d x3 = _mm_load_pd(&q[ldq+4]); + __m128d x4 = _mm_load_pd(&q[ldq+6]); + __m128d x5 = _mm_load_pd(&q[ldq+8]); + __m128d x6 = _mm_load_pd(&q[ldq+10]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + __m128d q3 = _mm_load_pd(&q[4]); + __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + __m128d q5 = _mm_load_pd(&q[8]); + __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); + __m128d q6 = _mm_load_pd(&q[10]); + __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); + + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); + y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2)); + q6 = _mm_load_pd(&q[(i*ldq)+10]); + x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); + y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [12 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + x3 = _mm_mul_pd(x3, h1); + x4 = _mm_mul_pd(x4, h1); + x5 = _mm_mul_pd(x5, h1); + x6 = _mm_mul_pd(x6, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); + + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); + y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); + y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); + y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + q3 = _mm_load_pd(&q[4]); + q3 = _mm_add_pd(q3, y3); + _mm_store_pd(&q[4],q3); + q4 = _mm_load_pd(&q[6]); + q4 = _mm_add_pd(q4, y4); 
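The -tau1 and -tau2 factors above are produced without a multiply: sign is a packed mask whose only set bit per 64-bit lane is the IEEE-754 sign bit (0x8000000000000000, built from _mm_set_pi32(0x80000000, 0x00000000)), so _mm_xor_pd(tau1, sign) negates both packed doubles. A scalar sketch of the same trick; flip_sign is illustrative only.

#include <stdint.h>
#include <string.h>

/* Negate a double by toggling its sign bit, mirroring the _mm_xor_pd mask. */
static double flip_sign(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);          /* reinterpret the bits safely */
    bits ^= UINT64_C(0x8000000000000000);    /* toggle the sign bit */
    memcpy(&x, &bits, sizeof bits);
    return x;                                /* equals -x */
}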
+ _mm_store_pd(&q[6],q4); + q5 = _mm_load_pd(&q[8]); + q5 = _mm_add_pd(q5, y5); + _mm_store_pd(&q[8],q5); + q6 = _mm_load_pd(&q[10]); + q6 = _mm_add_pd(q6, y6); + _mm_store_pd(&q[10],q6); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); + q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[ldq+6],q4); + q5 = _mm_load_pd(&q[ldq+8]); + q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2))); + _mm_store_pd(&q[ldq+8],q5); + q6 = _mm_load_pd(&q[ldq+10]); + q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); + _mm_store_pd(&q[ldq+10],q6); + + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); + q5 = _mm_load_pd(&q[(i*ldq)+8]); + q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2))); + _mm_store_pd(&q[(i*ldq)+8],q5); + q6 = _mm_load_pd(&q[(i*ldq)+10]); + q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); + _mm_store_pd(&q[(i*ldq)+10],q6); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + _mm_store_pd(&q[(nb*ldq)+6],q4); + q5 = _mm_load_pd(&q[(nb*ldq)+8]); + q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); + _mm_store_pd(&q[(nb*ldq)+8],q5); + q6 = _mm_load_pd(&q[(nb*ldq)+10]); + q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); + _mm_store_pd(&q[(nb*ldq)+10],q6); +} + +/** + * Unrolled kernel that computes + * 8 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ +__forceinline void hh_trafo_kernel_8_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + __m128d x3 = _mm_load_pd(&q[ldq+4]); + __m128d x4 = _mm_load_pd(&q[ldq+6]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + + __m128d q1 = _mm_load_pd(q); + 
__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + __m128d q3 = _mm_load_pd(&q[4]); + __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + __m128d q4 = _mm_load_pd(&q[6]); + __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [8 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + x3 = _mm_mul_pd(x3, h1); + x4 = _mm_mul_pd(x4, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); + + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); + y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + q3 = _mm_load_pd(&q[4]); + q3 = _mm_add_pd(q3, y3); + _mm_store_pd(&q[4],q3); + q4 = _mm_load_pd(&q[6]); + q4 = _mm_add_pd(q4, y4); + _mm_store_pd(&q[6],q4); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); + q3 = _mm_load_pd(&q[ldq+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[ldq+4],q3); + q4 = _mm_load_pd(&q[ldq+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[ldq+6],q4); + + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); + _mm_store_pd(&q[(i*ldq)+4],q3); + q4 = _mm_load_pd(&q[(i*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); + _mm_store_pd(&q[(i*ldq)+6],q4); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + 
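Note that this new _single_precision.c file still declares double* arguments and computes with __m128d registers (two doubles each); at this stage only the exported symbol names carry the _single suffix. A genuinely single-precision port would move the inner steps to the packed-float (_ps) intrinsics, roughly as in this hypothetical sketch; accum_step_ps and its signature are assumptions, not ELPA code.

#include <xmmintrin.h>

/* Hypothetical float variant of the accumulation step: four packed floats
 * per __m128 instead of two doubles per __m128d. */
static inline void accum_step_ps(const float *q, float h1, float h2,
                                 __m128 *x, __m128 *y)
{
    __m128 vq = _mm_load_ps(q);   /* 16-byte aligned load of 4 floats */
    *x = _mm_add_ps(*x, _mm_mul_ps(vq, _mm_set1_ps(h1)));
    *y = _mm_add_ps(*y, _mm_mul_ps(vq, _mm_set1_ps(h2)));
}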
_mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); + _mm_store_pd(&q[(nb*ldq)+4],q3); + q4 = _mm_load_pd(&q[(nb*ldq)+6]); + q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); + _mm_store_pd(&q[(nb*ldq)+6],q4); +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 2 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_2hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// + int i; + // Needed bit mask for floating point sign flip + __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); + __m128d sign = (__m128d)_mm_set1_epi64(smallsign); + + __m128d x1 = _mm_load_pd(&q[ldq]); + __m128d x2 = _mm_load_pd(&q[ldq+2]); + + __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h2; + + __m128d q1 = _mm_load_pd(q); + __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + __m128d q2 = _mm_load_pd(&q[2]); + __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + + for(i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + ///////////////////////////////////////////////////// + // Rank-2 update of Q [12 x nb+1] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(hh); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs = _mm_loaddup_pd(&s); + + h1 = _mm_xor_pd(tau1, sign); + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + h1 = _mm_xor_pd(tau2, sign); + h2 = _mm_mul_pd(h1, vs); + + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + + q1 = _mm_load_pd(q); + q1 = _mm_add_pd(q1, y1); + _mm_store_pd(q,q1); + q2 = _mm_load_pd(&q[2]); + q2 = _mm_add_pd(q2, y2); + _mm_store_pd(&q[2],q2); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[ldq],q1); + q2 = _mm_load_pd(&q[ldq+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[ldq+2],q2); + + for (i = 2; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-1]); + h2 = _mm_loaddup_pd(&hh[ldh+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); + _mm_store_pd(&q[i*ldq],q1); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); + _mm_store_pd(&q[(i*ldq)+2],q2); + } + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); + _mm_store_pd(&q[nb*ldq],q1); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); + _mm_store_pd(&q[(nb*ldq)+2],q2); +} diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c 
b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c similarity index 93% rename from src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c rename to src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c index 664d0434343b620648b61fb7d9a0b0ca88d31248..09c8030fa57a54742992c72ba74a0a015f319f9a 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c @@ -71,16 +71,16 @@ #endif //Forward declaration -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void quad_hh_trafo_real_sse_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void quad_hh_trafo_real_sse_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -134,7 +134,7 @@ void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* // Production level kernel calls with padding for (i = 0; i < nq-4; i+=6) { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { @@ -144,17 +144,17 @@ void quad_hh_trafo_real_sse_4hv_(double* q, double* hh, int* pnb, int* pnq, int* { if (nq-i > 2) { - hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { - hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } } #if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -202,12 +202,12 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i #ifdef __AVX__ for (i = 0; i < nq; i+=12) { - 
hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #else for (i = 0; i < nq; i+=6) { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #endif } @@ -218,7 +218,7 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [6 x nb+3] * hh @@ -576,7 +576,7 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh @@ -820,7 +820,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +__forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [2 x nb+3] * hh diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..bf3b9064bc35bd307a21de6a8e9ace67fc1198e1 --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c @@ -0,0 +1,995 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. 
Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level. +// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING".
+// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +//Forward declaration +__forceinline void hh_trafo_kernel_2_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_4_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_6_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); + +void quad_hh_trafo_real_sse_4hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void quad_hh_trafo_fast_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void quad_hh_trafo_real_sse_4hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + +// printf("s_1_2: %f\n", s_1_2); +// printf("s_1_3: %f\n", s_1_3); +// printf("s_2_3: %f\n", s_2_3); +// printf("s_1_4: %f\n", s_1_4); +// printf("s_2_4: %f\n", s_2_4); +// printf("s_3_4: %f\n", s_3_4); + + // Production level kernel calls with padding + for (i = 0; i < nq-4; i+=6) + { + hh_trafo_kernel_6_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + if (nq == i) + { + return; + } + else + { + if (nq-i > 2) + { + hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + else + { + hh_trafo_kernel_2_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } + } +} + +#if 0 +void quad_hh_trafo_fast_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 4 householder vectors simultaneously + double s_1_2 = hh[(ldh)+1]; + double s_1_3 = hh[(ldh*2)+2]; + double s_2_3 = hh[(ldh*2)+1]; + double s_1_4 = hh[(ldh*3)+3]; + double s_2_4 = hh[(ldh*3)+2]; + 
double s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; + + #pragma ivdep + for (i = 4; i < nb; i++) + { + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; + } + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=12) + { + hh_trafo_kernel_12_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#else + for (i = 0; i < nq; i+=6) + { + hh_trafo_kernel_6_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + } +#endif +} +#endif +/** + * Unrolled kernel that computes + * 6 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_6_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [6 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + register __m128d x1 = a1_1; + + __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a3_2 = _mm_load_pd(&q[ldq+2]); + __m128d a4_2 = _mm_load_pd(&q[0+2]); + + register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); + register __m128d x2 = a1_2; + + __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); + __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); + __m128d a3_3 = _mm_load_pd(&q[ldq+4]); + __m128d a4_3 = _mm_load_pd(&q[0+4]); + + register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); + w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); + w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); + register __m128d z3 = _mm_add_pd(a3_3, _mm_mul_pd(a2_3, h_3_2)); + z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); + register 
__m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); + register __m128d x3 = a1_3; + + __m128d q1; + __m128d q2; + __m128d q3; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [6 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + x3 = _mm_mul_pd(x3, h1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); + + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); + + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); + z3 = _mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d 
vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); + + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); + w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); + + q1 = _mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q3 = _mm_load_pd(&q[4]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); + q3 = _mm_sub_pd(q3, w3); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + _mm_store_pd(&q[4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[ldq+2]); + q3 = _mm_load_pd(&q[ldq+4]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); + q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); + q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); + + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[ldq+2],q2); + _mm_store_pd(&q[ldq+4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = _mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); + q3 = _mm_load_pd(&q[(ldq*2)+4]); + q1 = _mm_sub_pd(q1, y1); + q2 = _mm_sub_pd(q2, y2); + q3 = _mm_sub_pd(q3, y3); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); + + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + _mm_store_pd(&q[(ldq*2)+4],q3); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); + q3 = _mm_load_pd(&q[(ldq*3)+4]); + q1 = _mm_sub_pd(q1, x1); + q2 = _mm_sub_pd(q2, x2); + q3 = _mm_sub_pd(q3, x3); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); + _mm_store_pd(&q[ldq*3], q1); + _mm_store_pd(&q[(ldq*3)+2], q2); + _mm_store_pd(&q[(ldq*3)+4], q3); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + q3 = _mm_load_pd(&q[(i*ldq)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1,h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1,h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); + + _mm_store_pd(&q[i*ldq],q1); + _mm_store_pd(&q[(i*ldq)+2],q2); + _mm_store_pd(&q[(i*ldq)+4],q3); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = 
_mm_load_pd(&q[(nb*ldq)+2]); + q3 = _mm_load_pd(&q[(nb*ldq)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); + + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + _mm_store_pd(&q[(nb*ldq)+4],q3); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); + + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + _mm_store_pd(&q[((nb+1)*ldq)+4],q3); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); + + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); + _mm_store_pd(&q[((nb+2)*ldq)+4],q3); +} + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + __m128d x1 = a1_1; + + __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a3_2 = _mm_load_pd(&q[ldq+2]); + __m128d a4_2 = _mm_load_pd(&q[0+2]); + + __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); + __m128d x2 = a1_2; + + __m128d q1; + __m128d q2; + + __m128d h1; + 
__m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + + q2 = _mm_load_pd(&q[(i*ldq)+2]); + + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); + + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + x2 = _mm_mul_pd(x2, h1); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); + + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); + + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); + + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); + + q1 = _mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[ldq+2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); + q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); + + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[ldq+2],q2); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = 
_mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); + q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); + q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); + + _mm_store_pd(&q[ldq*3], q1); + _mm_store_pd(&q[(ldq*3)+2], q2); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); + + _mm_store_pd(&q[i*ldq],q1); + + q2 = _mm_load_pd(&q[(i*ldq)+2]); + + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); + + _mm_store_pd(&q[(i*ldq)+2],q2); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); + + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); + q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, h1))); + + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); +} + +/** + * Unrolled kernel that computes + * 2 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_2_SSE_4hv_single(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [2 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*3]); + __m128d a2_1 = _mm_load_pd(&q[ldq*2]); + __m128d a3_1 = _mm_load_pd(&q[ldq]); + __m128d a4_1 = _mm_load_pd(&q[0]); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = 
_mm_loaddup_pd(&hh[(ldh*3)+3]); + + __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + __m128d x1 = a1_1; + + __m128d q1; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + + for(i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + ///////////////////////////////////////////////////// + // Rank-1 update of Q [2 x nb+3] + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + + __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); + __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); + __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); + __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); + __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); + + h1 = tau1; + x1 = _mm_mul_pd(x1, h1); + + h1 = tau2; + h2 = _mm_mul_pd(h1, vs_1_2); + + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); + + h1 = tau3; + h2 = _mm_mul_pd(h1, vs_1_3); + h3 = _mm_mul_pd(h1, vs_2_3); + + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + + h1 = tau4; + h2 = _mm_mul_pd(h1, vs_1_4); + h3 = _mm_mul_pd(h1, vs_2_4); + h4 = _mm_mul_pd(h1, vs_3_4); + + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + + q1 = _mm_load_pd(&q[0]); + q1 = _mm_sub_pd(q1, w1); + _mm_store_pd(&q[0],q1); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); + + _mm_store_pd(&q[ldq],q1); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + q1 = _mm_load_pd(&q[ldq*2]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); + + _mm_store_pd(&q[ldq*2],q1); + + h2 = _mm_loaddup_pd(&hh[ldh+1]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + q1 = _mm_load_pd(&q[ldq*3]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); + + _mm_store_pd(&q[ldq*3], q1); + + for (i = 4; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-3]); + h2 = _mm_loaddup_pd(&hh[ldh+i-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); + + q1 = _mm_load_pd(&q[i*ldq]); + 
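+  // apply the rank-1 contributions of all four Householder vectors to this row of Q in one fused update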
+ q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); + + _mm_store_pd(&q[i*ldq],q1); + } + + h1 = _mm_loaddup_pd(&hh[nb-3]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + q1 = _mm_load_pd(&q[nb*ldq]); + + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); + + _mm_store_pd(&q[nb*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); + + _mm_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + _mm_store_pd(&q[(nb+2)*ldq],q1); +} diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c similarity index 97% rename from src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c rename to src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c index bc19037ca2397b22835707c607a311d6e58210c5..32a6be9602c793c436c21adeba3c1e9d12ca0b23 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c @@ -71,15 +71,15 @@ #endif //Forward declaration -static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +void hexa_hh_trafo_real_sse_6hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #if 0 void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif -void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +void hexa_hh_trafo_real_sse_6hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; @@ -221,7 +221,7 @@ void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* // Production level kernel calls with padding for (i = 0; i < nq-2; i+=4) { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { @@ -229,7 +229,7 @@ void hexa_hh_trafo_real_sse_6hv_(double* q, double* hh, int* pnb, int* pnq, int* } else { - hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } } @@ -377,12 +377,12 @@ void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i #ifdef __AVX__ for (i = 0; i < nq; i+=8) { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #else for (i = 0; i < nq; i+=4) { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } @@ -394,7 +394,7 @@ void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, 
i * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh @@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +__forceinline void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [2 x nb+3] * hh diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c new file mode 100644 index 0000000000000000000000000000000000000000..e2ef06277818ad1acbdf3b2bb8c438a04104afdd --- /dev/null +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c @@ -0,0 +1,1343 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file contains the compute intensive kernels for the Householder transformations. +// It should be compiled with the highest possible optimization level.
+// +// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 +// On Intel Sandy Bridge use -O3 -mavx +// +// Copyright of the original code rests with the authors inside the ELPA +// consortium. The copyright of any additional modifications shall rest +// with their original authors, but shall adhere to the licensing terms +// distributed along with the original code in the file "COPYING". +// +// Author: Alexander Heinecke (alexander.heinecke@mytum.de) +// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) +// -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#include + +#define __forceinline __attribute__((always_inline)) static + +#ifdef HAVE_SSE +#undef __AVX__ +#endif + +//Forward declaration +static void hh_trafo_kernel_2_SSE_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); +static void hh_trafo_kernel_4_SSE_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); + +void hexa_hh_trafo_real_sse_6hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif + +void hexa_hh_trafo_real_sse_6hv_single_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * 
hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); +// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: %f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding + for (i = 0; i < nq-2; i+=4) + { + hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } + if (nq == i) + { + return; + } + else + { + hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +} + +#if 0 +void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + + // calculating scalar products to compute + // 6 householder vectors simultaneously + double scalarprods[15]; + +// scalarprods[0] = s_1_2; +// scalarprods[1] = s_1_3; +// scalarprods[2] = s_2_3; +// scalarprods[3] = s_1_4; +// scalarprods[4] = s_2_4; +// scalarprods[5] = s_3_4; +// scalarprods[6] = s_1_5; +// scalarprods[7] = s_2_5; +// scalarprods[8] = s_3_5; +// scalarprods[9] = s_4_5; +// scalarprods[10] = s_1_6; +// scalarprods[11] = s_2_6; +// scalarprods[12] = s_3_6; +// 
scalarprods[13] = s_4_6; +// scalarprods[14] = s_5_6; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + #pragma ivdep + for (i = 6; i < nb; i++) + { + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; + } + +// printf("s_1_2: %f\n", scalarprods[0]); 
+// printf("s_1_3: %f\n", scalarprods[1]); +// printf("s_2_3: %f\n", scalarprods[2]); +// printf("s_1_4: %f\n", scalarprods[3]); +// printf("s_2_4: %f\n", scalarprods[4]); +// printf("s_3_4: %f\n", scalarprods[5]); +// printf("s_1_5: %f\n", scalarprods[6]); +// printf("s_2_5: %f\n", scalarprods[7]); +// printf("s_3_5: %f\n", scalarprods[8]); +// printf("s_4_5: %f\n", scalarprods[9]); +// printf("s_1_6: %f\n", scalarprods[10]); +// printf("s_2_6: %f\n", scalarprods[11]); +// printf("s_3_6: %f\n", scalarprods[12]); +// printf("s_4_6: %f\n", scalarprods[13]); +// printf("s_5_6: %f\n", scalarprods[14]); + + // Production level kernel calls with padding +#ifdef __AVX__ + for (i = 0; i < nq; i+=8) + { + hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#else + for (i = 0; i < nq; i+=4) + { + hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + } +#endif +} +#endif + +/** + * Unrolled kernel that computes + * 4 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_4_SSE_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [4 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*5]); + __m128d a2_1 = _mm_load_pd(&q[ldq*4]); + __m128d a3_1 = _mm_load_pd(&q[ldq*3]); + __m128d a4_1 = _mm_load_pd(&q[ldq*2]); + __m128d a5_1 = _mm_load_pd(&q[ldq]); + __m128d a6_1 = _mm_load_pd(&q[0]); + + __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); + __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); + __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); + __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); + + register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); + + __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); + __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); + __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); + + register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); + + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + + register __m128d x1 = a1_1; + + __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); + __m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]); + __m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]); + __m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]); + __m128d a5_2 = _mm_load_pd(&q[(ldq)+2]); + __m128d a6_2 = _mm_load_pd(&q[2]); + + 
register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2)); + t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1)); + register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2)); + v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1)); + register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); + w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); + register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); + z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); + register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); + + register __m128d x2 = a1_2; + + __m128d q1; + __m128d q2; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + __m128d h5; + __m128d h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-5]); + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); + + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); + + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); + t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); + + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + z2 = _mm_add_pd(z2, 
_mm_mul_pd(q2,h3)); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); + q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + x1 = _mm_mul_pd(x1, tau1); + x2 = _mm_mul_pd(x2, tau1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); + h2 = _mm_mul_pd(tau2, vs_1_2); + + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); + y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); + __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); + h2 = _mm_mul_pd(tau3, vs_1_3); + h3 = _mm_mul_pd(tau3, vs_2_3); + + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); + __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); + h2 = _mm_mul_pd(tau4, vs_1_4); + h3 = _mm_mul_pd(tau4, vs_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); + h4 = _mm_mul_pd(tau4, vs_3_4); + + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); + + __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); + __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); + __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); + h2 = _mm_mul_pd(tau5, vs_1_5); + h3 = _mm_mul_pd(tau5, vs_2_5); + __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); + __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); + h4 = _mm_mul_pd(tau5, vs_3_5); + h5 = _mm_mul_pd(tau5, vs_4_5); + + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); + + __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); + __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); + __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); + h2 = _mm_mul_pd(tau6, vs_1_6); + h3 = _mm_mul_pd(tau6, vs_2_6); + __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); + __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); + __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); + h4 = _mm_mul_pd(tau6, vs_3_6); + h5 = _mm_mul_pd(tau6, vs_4_6); + h6 = _mm_mul_pd(tau6, vs_5_6); + + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); + t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), 
_mm_mul_pd(x2,h2))))); + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [4 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm_load_pd(&q[0]); + q2 = _mm_load_pd(&q[2]); + q1 = _mm_sub_pd(q1, t1); + q2 = _mm_sub_pd(q2, t2); + _mm_store_pd(&q[0],q1); + _mm_store_pd(&q[2],q2); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + q1 = _mm_load_pd(&q[ldq]); + q2 = _mm_load_pd(&q[(ldq+2)]); + q1 = _mm_sub_pd(q1, v1); + q2 = _mm_sub_pd(q2, v2); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[ldq],q1); + _mm_store_pd(&q[(ldq+2)],q2); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + q1 = _mm_load_pd(&q[ldq*2]); + q2 = _mm_load_pd(&q[(ldq*2)+2]); + q1 = _mm_sub_pd(q1, w1); + q2 = _mm_sub_pd(q2, w2); + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[ldq*2],q1); + _mm_store_pd(&q[(ldq*2)+2],q2); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq*3]); + q2 = _mm_load_pd(&q[(ldq*3)+2]); + q1 = _mm_sub_pd(q1, z1); + q2 = _mm_sub_pd(q2, z2); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[ldq*3],q1); + _mm_store_pd(&q[(ldq*3)+2],q2); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + q1 = _mm_load_pd(&q[ldq*4]); + q2 = _mm_load_pd(&q[(ldq*4)+2]); + q1 = _mm_sub_pd(q1, y1); + q2 = _mm_sub_pd(q2, y2); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[ldq*4],q1); + _mm_store_pd(&q[(ldq*4)+2],q2); + + h2 = _mm_loaddup_pd(&hh[(ldh)+1]); + q1 = _mm_load_pd(&q[ldq*5]); + q2 = _mm_load_pd(&q[(ldq*5)+2]); + q1 = _mm_sub_pd(q1, x1); + q2 = _mm_sub_pd(q2, x2); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[ldq*5],q1); + _mm_store_pd(&q[(ldq*5)+2],q2); + + for (i = 6; i < nb; i++) + { + q1 = _mm_load_pd(&q[i*ldq]); + q2 = _mm_load_pd(&q[(i*ldq)+2]); + h1 = _mm_loaddup_pd(&hh[i-5]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + h3 = 
_mm_loaddup_pd(&hh[(ldh*2)+i-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); + + _mm_store_pd(&q[i*ldq],q1); + _mm_store_pd(&q[(i*ldq)+2],q2); + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + q2 = _mm_load_pd(&q[(nb*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); + + _mm_store_pd(&q[nb*ldq],q1); + _mm_store_pd(&q[(nb*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); + + _mm_store_pd(&q[(nb+1)*ldq],q1); + _mm_store_pd(&q[((nb+1)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); + + _mm_store_pd(&q[(nb+2)*ldq],q1); + _mm_store_pd(&q[((nb+2)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); + + _mm_store_pd(&q[(nb+3)*ldq],q1); + _mm_store_pd(&q[((nb+3)*ldq)+2],q2); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); + q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); + + _mm_store_pd(&q[(nb+4)*ldq],q1); + _mm_store_pd(&q[((nb+4)*ldq)+2],q2); +} + +/** + * Unrolled kernel that computes + * 2 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_2_SSE_6hv_single(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) +{ + 
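+ // Two-row variant of the 4-row kernel above: the same six-vector update using a single set of __m128d accumulators. scalarprods[0..14] holds the precomputed pairwise products s_1_2, s_1_3, s_2_3, s_1_4, ..., s_5_6 of the six householder vectors, in the order listed by the debug printfs further up.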
///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [2 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m128d a1_1 = _mm_load_pd(&q[ldq*5]); + __m128d a2_1 = _mm_load_pd(&q[ldq*4]); + __m128d a3_1 = _mm_load_pd(&q[ldq*3]); + __m128d a4_1 = _mm_load_pd(&q[ldq*2]); + __m128d a5_1 = _mm_load_pd(&q[ldq]); + __m128d a6_1 = _mm_load_pd(&q[0]); + + __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); + __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); + __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); + __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); + + register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); + t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); + + __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); + __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); + __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); + + register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); + v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); + + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); + w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); + + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); + __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); + z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); + register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); + + register __m128d x1 = a1_1; + + __m128d q1; + + __m128d h1; + __m128d h2; + __m128d h3; + __m128d h4; + __m128d h5; + __m128d h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm_loaddup_pd(&hh[i-5]); + q1 = _mm_load_pd(&q[i*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); + + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); + + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); + + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); + + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); + + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); + + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); + + w1 = 
_mm_add_pd(w1, _mm_mul_pd(q1,h4)); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); + + h1 = _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); + + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m128d tau1 = _mm_loaddup_pd(&hh[0]); + x1 = _mm_mul_pd(x1, tau1); + + __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); + __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); + h2 = _mm_mul_pd(tau2, vs_1_2); + + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); + + __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); + __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); + __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); + h2 = _mm_mul_pd(tau3, vs_1_3); + h3 = _mm_mul_pd(tau3, vs_2_3); + + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); + + __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); + __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); + __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); + h2 = _mm_mul_pd(tau4, vs_1_4); + h3 = _mm_mul_pd(tau4, vs_2_4); + __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); + h4 = _mm_mul_pd(tau4, vs_3_4); + + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + + __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); + __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); + __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); + h2 = _mm_mul_pd(tau5, vs_1_5); + h3 = _mm_mul_pd(tau5, vs_2_5); + __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); + __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); + h4 = _mm_mul_pd(tau5, vs_3_5); + h5 = _mm_mul_pd(tau5, vs_4_5); + + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); + + __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); + __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); + __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); + h2 = _mm_mul_pd(tau6, vs_1_6); + h3 = _mm_mul_pd(tau6, vs_2_6); + __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); + __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); + __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); + h4 = _mm_mul_pd(tau6, vs_3_6); + h5 = _mm_mul_pd(tau6, vs_4_6); + h6 = _mm_mul_pd(tau6, vs_5_6); + + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [2 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm_load_pd(&q[0]); + q1 = _mm_sub_pd(q1, t1); + _mm_store_pd(&q[0],q1); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); + q1 = _mm_load_pd(&q[ldq]); + q1 = _mm_sub_pd(q1, v1); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[ldq],q1); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); + q1 = _mm_load_pd(&q[ldq*2]); + q1 = _mm_sub_pd(q1, w1); + + q1 = 
_mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[ldq*2],q1); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); + q1 = _mm_load_pd(&q[ldq*3]); + q1 = _mm_sub_pd(q1, z1); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[ldq*3],q1); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); + q1 = _mm_load_pd(&q[ldq*4]); + q1 = _mm_sub_pd(q1, y1); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[ldq*4],q1); + + h2 = _mm_loaddup_pd(&hh[(ldh)+1]); + q1 = _mm_load_pd(&q[ldq*5]); + q1 = _mm_sub_pd(q1, x1); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[ldq*5],q1); + + for (i = 6; i < nb; i++) + { + q1 = _mm_load_pd(&q[i*ldq]); + h1 = _mm_loaddup_pd(&hh[i-5]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); + + _mm_store_pd(&q[i*ldq],q1); + } + + h1 = _mm_loaddup_pd(&hh[nb-5]); + q1 = _mm_load_pd(&q[nb*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); + + _mm_store_pd(&q[nb*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-4]); + q1 = _mm_load_pd(&q[(nb+1)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); + + _mm_store_pd(&q[(nb+1)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-3]); + q1 = _mm_load_pd(&q[(nb+2)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); + + _mm_store_pd(&q[(nb+2)*ldq],q1); + + h1 = _mm_loaddup_pd(&hh[nb-2]); + q1 = _mm_load_pd(&q[(nb+3)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); + + _mm_store_pd(&q[(nb+3)*ldq],q1); + + h1 
= _mm_loaddup_pd(&hh[nb-1]); + q1 = _mm_load_pd(&q[(nb+4)*ldq]); + + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); + + _mm_store_pd(&q[(nb+4)*ldq],q1); +} diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 index 0fcfa7f417daf124edd82d9ed73e35587936adc5..08a654fcea7a01a41f8617c5a9e89674ab4df0a4 100644 --- a/src/mod_compute_hh_trafo_complex.F90 +++ b/src/mod_compute_hh_trafo_complex.F90 @@ -151,18 +151,18 @@ module compute_hh_trafo_complex w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe), & + call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo #ifdef WITH_OPENMP - if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe,my_thread), & + if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe,my_thread), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #else - if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe), & + if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #endif #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) @@ -180,18 +180,18 @@ module compute_hh_trafo_complex w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), & + call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo #ifdef WITH_OPENMP - if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe,my_thread), & + if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe,my_thread), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #else - if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe), & + if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #endif #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) @@ -288,11 +288,11 @@ module compute_hh_trafo_complex !#if defined(WITH_AVX_SANDYBRIDGE) -! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) +! call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) !#endif !#if defined(WITH_AMD_BULLDOZER) -! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) +! 
call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) !#endif #if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) @@ -302,10 +302,10 @@ module compute_hh_trafo_complex ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP - call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe,my_thread), & + call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe,my_thread), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #else - call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe), & + call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo @@ -322,10 +322,10 @@ module compute_hh_trafo_complex ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP - call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe,my_thread), & + call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe,my_thread), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #else - call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe), & + call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo @@ -447,18 +447,18 @@ module compute_hh_trafo_complex w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_complex_sse_avx_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_complex_sse_avx_2hv_single(a(1,j+off+a_off-1,istripe), & + call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo #ifdef WITH_OPENMP - if (j==1) call single_hh_trafo_complex_sse_avx_1hv_single(a(1,1+off+a_off,istripe,my_thread), & + if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe,my_thread), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #else - if (j==1) call single_hh_trafo_complex_sse_avx_1hv_single(a(1,1+off+a_off,istripe), & + if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe), & bcast_buffer(1,off+1), nbw, nl, stripe_width) #endif #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) @@ -555,11 +555,11 @@ module compute_hh_trafo_complex !#if defined(WITH_AVX_SANDYBRIDGE) -! call single_hh_trafo_complex_sse_avx_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) +! call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) !#endif !#if defined(WITH_AMD_BULLDOZER) -! call single_hh_trafo_complex_sse_avx_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) +! 
call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) !#endif #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) @@ -569,10 +569,10 @@ module compute_hh_trafo_complex ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP - call single_hh_trafo_complex_sse_avx_1hv_double_single(a(1,j+off+a_off,istripe,my_thread), & + call single_hh_trafo_complex_avx_avx2_1hv_double_single(a(1,j+off+a_off,istripe,my_thread), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #else - call single_hh_trafo_complex_sse_avx_1hv_single(a(1,j+off+a_off,istripe), & + call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #endif enddo diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 index 39a9cc987afe225f435881f63002e1d5bd803203..c574f98e4c938be003a16b8e2f8b58e3554f6a57 100644 --- a/src/mod_compute_hh_trafo_real.F90 +++ b/src/mod_compute_hh_trafo_real.F90 @@ -299,10 +299,10 @@ module compute_hh_trafo_real w(:,1) = bcast_buffer(1:nbw,j+off) w(:,2) = bcast_buffer(1:nbw,j+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_sse_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_sse_2hv(a(1,j+off+a_off-1,istripe), & + call double_hh_trafo_real_sse_2hv_double(a(1,j+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo @@ -322,10 +322,10 @@ module compute_hh_trafo_real w(:,2) = bcast_buffer(1:nbw,j+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo @@ -377,7 +377,7 @@ module compute_hh_trafo_real !#if defined(WITH_AVX_SANDYBRIDGE) -! call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw) +! 
call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw) !#endif #ifdef WITH_OPENMP @@ -407,10 +407,10 @@ module compute_hh_trafo_real w(:,3) = bcast_buffer(1:nbw,j+off-2) w(:,4) = bcast_buffer(1:nbw,j+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_avx_avx2_4hv_double(a(1,j+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_avx_avx2_4hv_double(a(1,j+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -418,10 +418,10 @@ module compute_hh_trafo_real w(:,1) = bcast_buffer(1:nbw,jj+off) w(:,2) = bcast_buffer(1:nbw,jj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,jj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,jj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo @@ -451,10 +451,10 @@ module compute_hh_trafo_real w(:,5) = bcast_buffer(1:nbw,j+off-4) w(:,6) = bcast_buffer(1:nbw,j+off-5) #ifdef WITH_OPENMP - call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, & + call hexa_hh_trafo_real_avx_avx2_6hv_double(a(1,j+off+a_off-5,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe), w, & + call hexa_hh_trafo_real_avx_avx2_6hv_double(a(1,j+off+a_off-5,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -464,10 +464,10 @@ module compute_hh_trafo_real w(:,3) = bcast_buffer(1:nbw,jj+off-2) w(:,4) = bcast_buffer(1:nbw,jj+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_avx_avx2_4hv_double(a(1,jj+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_avx_avx2_4hv_double(a(1,jj+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -475,10 +475,10 @@ module compute_hh_trafo_real w(:,1) = bcast_buffer(1:nbw,jjj+off) w(:,2) = bcast_buffer(1:nbw,jjj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,jjj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe), & + call double_hh_trafo_real_avx_avx2_2hv_double(a(1,jjj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo @@ -571,11 +571,11 @@ module compute_hh_trafo_real #ifndef WITH_OPENMP integer(kind=ik), intent(in) :: last_stripe_width ! real(kind=rk4) :: a(stripe_width,a_dim2,stripe_count) - real(kind=rk4), allocatable :: a(:,:,:) + real(kind=rk4), pointer :: a(:,:,:) #else integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width ! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count,max_threads) - real(kind=rk4), allocatable :: a(:,:,:,:) + real(kind=rk4), pointer :: a(:,:,:,:) #endif integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL @@ -809,7 +809,7 @@ module compute_hh_trafo_real !#if defined(WITH_AVX_SANDYBRIDGE) -! 
call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw) +! call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw) !#endif #ifdef WITH_OPENMP @@ -840,10 +840,10 @@ module compute_hh_trafo_real w(:,3) = bcast_buffer(1:nbw,j+off-2) w(:,4) = bcast_buffer(1:nbw,j+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_sse_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_sse_4hv_single(a(1,j+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_sse_4hv(a(1,j+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_sse_4hv_single(a(1,j+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -851,18 +851,18 @@ module compute_hh_trafo_real w(:,1) = bcast_buffer(1:nbw,jj+off) w(:,2) = bcast_buffer(1:nbw,jj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_sse_2hv(a(1,jj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_sse_2hv_single(a(1,jj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_sse_2hv(a(1,jj+off+a_off-1,istripe), & + call double_hh_trafo_real_sse_2hv_single(a(1,jj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo #ifdef WITH_OPENMP - if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & + if (jj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #else - if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & + if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) @@ -883,10 +883,10 @@ module compute_hh_trafo_real w(:,4) = bcast_buffer(1:nbw,j+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_avx_avx2_4hv_single(a(1,j+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_avx_avx2_4hv_single(a(1,j+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -895,10 +895,10 @@ module compute_hh_trafo_real w(:,2) = bcast_buffer(1:nbw,jj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_avx_avx2_2hv_single(a(1,jj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe), & + call double_hh_trafo_real_avx_avx2_2hv_single(a(1,jj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo @@ -927,10 +927,10 @@ module compute_hh_trafo_real w(:,5) = bcast_buffer(1:nbw,j+off-4) w(:,6) = bcast_buffer(1:nbw,j+off-5) #ifdef WITH_OPENMP - call hexa_hh_trafo_real_sse_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, & + call hexa_hh_trafo_real_sse_6hv_single(a(1,j+off+a_off-5,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call hexa_hh_trafo_real_sse_6hv(a(1,j+off+a_off-5,istripe), w, & + call hexa_hh_trafo_real_sse_6hv_single(a(1,j+off+a_off-5,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -940,10 +940,10 @@ module compute_hh_trafo_real w(:,3) = bcast_buffer(1:nbw,jj+off-2) w(:,4) = 
bcast_buffer(1:nbw,jj+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_sse_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_sse_4hv_single(a(1,jj+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_sse_4hv(a(1,jj+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_sse_4hv_single(a(1,jj+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -951,18 +951,18 @@ module compute_hh_trafo_real w(:,1) = bcast_buffer(1:nbw,jjj+off) w(:,2) = bcast_buffer(1:nbw,jjj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_sse_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_sse_2hv_single(a(1,jjj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_sse_2hv(a(1,jjj+off+a_off-1,istripe), & + call double_hh_trafo_real_sse_2hv_single(a(1,jjj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo #ifdef WITH_OPENMP - if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & + if (jjj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #else - if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & + if (jjj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) #endif #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) @@ -986,10 +986,10 @@ module compute_hh_trafo_real w(:,6) = bcast_buffer(1:nbw,j+off-5) #ifdef WITH_OPENMP - call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, & + call hexa_hh_trafo_real_avx_avx2_6hv_single(a(1,j+off+a_off-5,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe), w, & + call hexa_hh_trafo_real_avx_avx2_6hv_single(a(1,j+off+a_off-5,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -1000,10 +1000,10 @@ module compute_hh_trafo_real w(:,4) = bcast_buffer(1:nbw,jj+off-3) #ifdef WITH_OPENMP - call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, & + call quad_hh_trafo_real_avx_avx2_4hv_single(a(1,jj+off+a_off-3,istripe,my_thread), w, & nbw, nl, stripe_width, nbw) #else - call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, & + call quad_hh_trafo_real_avx_avx2_4hv_single(a(1,jj+off+a_off-3,istripe), w, & nbw, nl, stripe_width, nbw) #endif enddo @@ -1012,10 +1012,10 @@ module compute_hh_trafo_real w(:,2) = bcast_buffer(1:nbw,jjj+off-1) #ifdef WITH_OPENMP - call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), & + call double_hh_trafo_real_avx_avx2_2hv_single(a(1,jjj+off+a_off-1,istripe,my_thread), & w, nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe), & + call double_hh_trafo_real_avx_avx2_2hv_single(a(1,jjj+off+a_off-1,istripe), & w, nbw, nl, stripe_width, nbw) #endif enddo