Commit 777d66f8 authored by Andreas Marek's avatar Andreas Marek

Start to unify complex SSE kernels

parent 4817663a
......@@ -926,7 +926,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
......
......@@ -795,7 +795,7 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_vsx_4hv_template.c \
......
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"
#define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h)
#define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h
#define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g)
#define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g
#define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f)
#define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f
#define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e)
#define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e
#define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d)
#define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d
#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
#define CONCAT2_3ARGS(a, b, c) a ## b ## c
//define instruction set numbers
#define SSE_128 128
#define NEON_ARCH64_128 1285
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == 512
#include <x86intrin.h>
#ifdef BLOCK2
#include <pmmintrin.h>
#endif
#endif
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef BLOCK2
#define PREFIX double
#define BLOCK 2
#endif
#ifdef BLOCK1
#define PREFIX single
#define BLOCK 1
#endif
#if VEC_SET == SSE_128
#define SIMD_SET SSE
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SIMD_DATATYPE __m128d
#define _SIMD_LOAD _mm_load_pd
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE _mm_store_pd
#define _SIMD_MUL _mm_mul_pd
#define _SIMD_ADD _mm_add_pd
#define _SIMD_XOR _mm_xor_pd
#define _SIMD_MADDSUB _mm_maddsub_pd
#define _SIMD_ADDSUB _mm_addsub_pd
#define _SIMD_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m128
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
#define _SIMD_MADDSUB _mm_maddsub_ps
#define _SIMD_ADDSUB _mm_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#define WORD_LENGTH double
#define DATA_TYPE double complex
#define DATA_TYPE_PTR double complex*
#define DATA_TYPE_REAL double
#define DATA_TYPE_REAL_PTR double*
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define WORD_LENGTH single
#define DATA_TYPE float complex
#define DATA_TYPE_PTR float complex*
#define DATA_TYPE_REAL float
#define DATA_TYPE_REAL_PTR float*
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == SSE_128 */
//Forward declaration
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == SSE_128 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == SSE_128 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine single_hh_trafo_complex_SSE_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine single_hh_trafo_complex_SSE_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
)
#endif
#ifdef BLOCK2
,int* pldh)
#endif
{
int i, worked_on;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
#ifdef BLOCK2
int ldh = *pldh;
DATA_TYPE s = conj(hh[(ldh)+1])*1.0;
for (i = 2; i < nb; i++)
{
s += hh[i-1] * conj(hh[(i+ldh)]);
}
#endif
worked_on = 0;
#ifdef BLOCK1
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#define STEP_SIZE 6
#define UPPER_BOUND 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 8
#endif
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
if (nq == i) {
return;
}
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
#endif /* BLOCK1 */
#ifdef BLOCK2
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#define UPPER_BOUND 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#define STEP_SIZE 8
#define UPPER_BOUND 4
#endif
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on +=ROW_LENGTH;
}
if (nq == i)
{
return;
}
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#define STEP_SIZE 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#endif
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#endif /* BLOCK2 */
#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
abort();
}
#endif
}
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
)
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s)
#endif
{
DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
#ifdef BLOCK2
DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
#endif
__SIMD_DATATYPE x1, x2, x3, x4, x5, x6;
__SIMD_DATATYPE q1, q2, q3, q4, q5, q6;
#ifdef BLOCK2
__SIMD_DATATYPE y1, y2, y3, y4, y5, y6;
__SIMD_DATATYPE h2_real, h2_imag;
#endif
__SIMD_DATATYPE h1_real, h1_imag;
__SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
int i=0;
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
#endif /* VEC_SET == SSE_128 */
#ifdef BLOCK2
x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]);
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif
#endif /* VEC_SET == SSE_128 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
y1 = _SIMD_LOAD(&q_dbl[0]);
y2 = _SIMD_LOAD(&q_dbl[offset]);
y3 = _SIMD_LOAD(&q_dbl[2*offset]);
y4 = _SIMD_LOAD(&q_dbl[3*offset]);
y5 = _SIMD_LOAD(&q_dbl[4*offset]);
y6 = _SIMD_LOAD(&q_dbl[5*offset]);
tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, x5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_ADD(y5, _mm_msubadd_pd(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, x6);
#ifdef __ELPA_USE_FMA__
y6 = _SIMD_ADD(y6, _mm_msubadd_pd(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#endif /* BLOCK2 */
#ifdef BLOCK1
x1 = _SIMD_LOAD(&q_dbl[0]);
x2 = _SIMD_LOAD(&q_dbl[offset]);
x3 = _SIMD_LOAD(&q_dbl[2*offset]);
x4 = _SIMD_LOAD(&q_dbl[3*offset]);
x5 = _SIMD_LOAD(&q_dbl[4*offset]);
x6 = _SIMD_LOAD(&q_dbl[5*offset]);
#endif
for (i = BLOCK; i < nb; i++)
{
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
#endif
#endif /* VEC_SET == SSE_128 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SIMD_XOR(h1_imag, sign);
#endif
q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
tmp1 = _SIMD_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SIMD_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SIMD_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _SIMD_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
x4 = _SIMD_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
x5 = _SIMD_ADD(x5, _mm_msubadd_pd(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
x6 = _SIMD_ADD(x6, _mm_msubadd_pd(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#ifdef BLOCK2
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#endif
#endif /* VEC_SET == SSE_128 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
tmp1 = _SIMD_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__