//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of version 3 of the GNU Lesser General
//    Public License as published by the Free Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
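//
// As noted above, on SSE hardware this file is meant to be built with -O3 -msse3.
// A hypothetical standalone compile of the double-precision variant could look like
// the line below (illustration only; in the real build the feature and precision
// macros are supplied by configure and the surrounding template machinery):
//
//   gcc -O3 -msse3 -DHAVE_SSE_INTRINSICS -DDOUBLE_PRECISION_COMPLEX -c complex_sse_2hv_template.c
//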
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <pmmintrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_LOADU _mm_loadu_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_STOREU _mm_storeu_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_XOR _mm_xor_pd
#define _SSE_ADDSUB _mm_addsub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_LOADU _mm_loadu_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_STOREU _mm_storeu_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_XOR _mm_xor_ps
#define _SSE_ADDSUB _mm_addsub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif
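
// Note on the arithmetic used throughout this file (explanatory comment): complex
// numbers are stored as interleaved (real, imag) pairs, so one __m128d holds one
// double-complex value and one __m128 holds two float-complex values. A complex
// product h*q is assembled as
//
//   tmp = h_imag * q
//   h*q = _SSE_ADDSUB(h_real * q, _SSE_SHUFFLE(tmp, tmp, _SHUFFLE))
//
// which evaluates to (h_re*q_re - h_im*q_im, h_re*q_im + h_im*q_re). Negating
// h_imag with the `sign` mask first (or using the fused _mm_msubadd_pd variant
// when __ELPA_USE_FMA__ is defined) produces conj(h)*q instead.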



#ifdef DOUBLE_PRECISION_COMPLEX
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#if 0
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value                :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
#ifdef DOUBLE_PRECISION_COMPLEX
        double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float complex s = conj(hh[(ldh)+1])*1.0f;
#endif
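        // s accumulates the (conjugated) scalar product of the two Householder
        // vectors stored in hh(:,1) and hh(:,2); it is the coupling term needed
        // when both reflectors are applied together as one rank-2 update in the
        // kernels below (explanatory comment).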
        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * conj(hh[(i+ldh)]);
        }

        for (i = 0; i < nq; i+=4)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
#endif
        }
}
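
/*
   Usage sketch (illustration only, not part of the library): all integer arguments
   are passed by reference, matching the Fortran interfaces above, and the q panel
   is traversed four entries at a time, so nq is assumed to be a multiple of four
   here (the narrower 1/2/3-wide kernels are compiled out via #if 0). A hypothetical
   call of the double-precision driver from C could look like

     int nb = ..., nq = ..., ldq = ..., ldh = ...;   // sizes provided by the caller
     double complex *q  = ...;                       // roughly (nb+1)*ldq elements, SSE-aligned
     double complex *hh = ...;                       // two Householder vectors, ldh elements apart
     double_hh_trafo_complex_sse_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);

   Since q is accessed with aligned loads/stores (_mm_load_pd/_mm_store_pd), the base
   pointer and leading dimension must keep every processed block 16-byte aligned.
*/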
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
#endif
        __SSE_DATATYPE x1, x2, x3, x4;
        __SSE_DATATYPE y1, y2, y3, y4;
        __SSE_DATATYPE q1, q2, q3, q4;
        __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
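
        // Explanatory comment: the x registers accumulate the (conjugated) first
        // Householder vector applied to the four q entries handled by one call
        // (x1..x4 in double precision, x1..x2 in single precision); the y registers
        // do the same for the second Householder vector. After scaling with the
        // factors taken from hh[0] and hh[ldh] (negated via the sign mask), x and y
        // are written back to q as a rank-2 update.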

#ifdef DOUBLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
        x2 = _SSE_LOAD(&q_dbl[(2*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        x3 = _SSE_LOAD(&q_dbl[(2*ldq)+2*offset]);
        x4 = _SSE_LOAD(&q_dbl[(2*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
        // conjugate
        h2_imag = _SSE_XOR(h2_imag, sign);
#endif

        y1 = _SSE_LOAD(&q_dbl[0]);
        y2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        y3 = _SSE_LOAD(&q_dbl[2*offset]);
        y4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif

        tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
        y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
        y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
        y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
        y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        for (i = 2; i < nb; i++)
        {
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
#endif

                tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
                x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
                x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
                x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
                h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
                h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
                h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
                // conjugate
                h2_imag = _SSE_XOR(h2_imag, sign);
#endif

                tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
                y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
                y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
                y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
                y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
        }

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _SSE_XOR(h1_imag, sign);
#endif

        q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
#endif

        tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
        x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
#endif

        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);

        tmp1 = _SSE_MUL(h1_imag, x1);

#ifdef __ELPA_USE_FMA__
        x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        x3 = _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
        tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        x4 = _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        x4 = _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
#endif

        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);
        h2_real = _SSE_XOR(h2_real, sign);
        h2_imag = _SSE_XOR(h2_imag, sign);

#ifdef SINGLE_PRECISION_COMPLEX
        tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
#else
        tmp2 = _SSE_LOADU(s_dbl);
#endif
        tmp1 = _SSE_MUL(h2_imag, tmp2);

#ifdef __ELPA_USE_FMA__
        tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_movedup_pd(tmp2);
        h2_imag = _mm_set1_pd(tmp2[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(tmp2);
        h2_imag = _mm_movehdup_ps(tmp2);
#endif

        tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
        y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
        y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
        y3 = _mm_maddsub_pd(h1_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        y3 = _SSE_ADDSUB( _SSE_MUL(h1_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
        tmp4 = _SSE_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
        y4 = _mm_maddsub_pd(h1_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        y4 = _SSE_ADDSUB( _SSE_MUL(h1_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
        y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
        y2 = _SSE_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
        y3 = _SSE_ADD(y3, _mm_maddsub_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
        y4 = _SSE_ADD(y4, _mm_maddsub_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        q1 = _SSE_LOAD(&q_dbl[0]);
        q2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[2*offset]);
        q4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif
        q1 = _SSE_ADD(q1, y1);
        q2 = _SSE_ADD(q2, y2);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_ADD(q3, y3);
        q4 = _SSE_ADD(q4, y4);
#endif
        _SSE_STORE(&q_dbl[0], q1);
        _SSE_STORE(&q_dbl[offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[2*offset], q3);
        _SSE_STORE(&q_dbl[3*offset], q4);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif

        q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _SSE_LOAD(&q_dbl[(ldq*2)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(ldq*2)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(ldq*2)+3*offset]);
#endif
        q1 = _SSE_ADD(q1, x1);
        q2 = _SSE_ADD(q2, x2);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_ADD(q3, x3);
        q4 = _SSE_ADD(q4, x4);
#endif
        tmp1 = _SSE_MUL(h2_imag, y1);

#ifdef __ELPA_USE_FMA__
        q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
        q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
        q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
        q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        _SSE_STORE(&q_dbl[(ldq*2)+0], q1);
        _SSE_STORE(&q_dbl[(ldq*2)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[(ldq*2)+2*offset], q3);
        _SSE_STORE(&q_dbl[(ldq*2)+3*offset], q4);
#endif

        for (i = 2; i < nb; i++)
        {
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#endif

                tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
                q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
                q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
                h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
                h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
                h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#endif

                tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
                q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
                q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

                _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
                _SSE_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _SSE_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
#endif
        }

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#endif


        q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
#endif

        tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */


        _SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
        _SSE_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
        _SSE_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
#endif
}

#if 0

static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);

        __SSE_DATATYPE x1, x2, x3;
        __SSE_DATATYPE y1, y2, y3;
        __SSE_DATATYPE q1, q2, q3;
        __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3;
        int i=0;

        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);

        x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
        x2 = _SSE_LOAD(&q_dbl[(2*ldq)+2]);
        x3 = _SSE_LOAD(&q_dbl[(2*ldq)+4]);

        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
        // conjugate
        h2_imag = _SSE_XOR(h2_imag, sign);
#endif

        y1 = _SSE_LOAD(&q_dbl[0]);
        y2 = _SSE_LOAD(&q_dbl[2]);
        y3 = _SSE_LOAD(&q_dbl[4]);

        tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
        y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
        y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
        tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
        y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

        for (i = 2; i < nb; i++)
        {
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2]);
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+4]);

                h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
#endif

                tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
                x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
                tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
                x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

                h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
                h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
                // conjugate
                h2_imag = _SSE_XOR(h2_imag, sign);
#endif

                tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
                y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
                y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
                tmp3 = _SSE_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
                y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        }

        h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _SSE_XOR(h1_imag, sign);
#endif

        q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2]);
        q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+4]);

        tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
        tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);

        tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        x3 = _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

        h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
        h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);

        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);
        h2_real = _SSE_XOR(h2_real, sign);
        h2_imag = _SSE_XOR(h2_imag, sign);

        tmp2 = _SSE_LOADU(s_dbl);
        tmp1 = _SSE_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
        tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        _SSE_STOREU(s_dbl, tmp2);
        h2_real = _mm_set1_pd(s_dbl[0]);
        h2_imag = _mm_set1_pd(s_dbl[1]);

        // h2_real = _mm_loaddup_pd(&s_dbl[0]);
        // h2_imag = _mm_loaddup_pd(&s_dbl[1]);

        tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
        y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
        y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
        tmp3 = _SSE_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
        y3 = _mm_maddsub_pd(h1_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        y3 = _SSE_ADDSUB( _SSE_MUL(h1_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

        tmp1 = _SSE_MUL(h2_imag, x1);