//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
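//
// As an illustration only (the actual compiler invocation and the preprocessor
// definitions are chosen by the ELPA build system, so treat the flags and the
// file name below as assumptions), a standalone SSE build of the double
// precision variant might look like:
//
//    gcc -O3 -msse3 -DHAVE_SSE_INTRINSICS -DDOUBLE_PRECISION_COMPLEX \
//        -c complex_sse_2hv_template.c
//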
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <pmmintrin.h>

#define __forceinline __attribute__((always_inline))
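// When the SSE kernels are selected, make sure none of the AVX code paths
// can be taken below, even if the compiler itself defines __AVX__.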

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_LOADU _mm_loadu_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_STOREU _mm_storeu_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_XOR _mm_xor_pd
#define _SSE_ADDSUB _mm_addsub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_LOADU _mm_loadu_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_STOREU _mm_storeu_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_XOR _mm_xor_ps
#define _SSE_ADDSUB _mm_addsub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif
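// The macros above let one source file serve both precisions: offset is the
// number of real-valued entries in one 128-bit register (2 doubles or
// 4 floats), and _SHUFFLE swaps the real and imaginary parts within each
// complex number (used below to build complex products from mul/addsub).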



#ifdef DOUBLE_PRECISION_COMPLEX
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#if 0
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif
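// The !f> blocks that follow are the Fortran interfaces of the C kernels;
// the ELPA build system extracts these specially marked comments to generate
// the corresponding Fortran interface files.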

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value                :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
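        // Driver: applies two Householder transformations (the two columns of
        // hh) to all nq columns of q. First the scalar product s of the two
        // Householder vectors, which couples the transformations, is
        // accumulated; then the columns of q are processed in blocks of four
        // complex entries by the compute kernel below.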
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;

#ifdef DOUBLE_PRECISION_COMPLEX
        double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float complex s = conj(hh[(ldh)+1])*1.0f;
#endif

        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * conj(hh[(i+ldh)]);
        }

        for (i = 0; i < nq; i+=4)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
#endif
        }
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif
{
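        // Kernel for 4 complex columns of q. x accumulates q times the
        // conjugated first Householder vector, y the same for the second one;
        // afterwards q receives the rank-2 correction built from x, y, the two
        // tau factors and the coupling s. Complex products are formed from
        // broadcast real/imaginary parts plus shuffle/addsub, which is why the
        // imaginary broadcasts are conjugated via XOR with the sign mask when
        // no FMA-style intrinsics are available.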
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
#endif

        __SSE_DATATYPE x1, x2, x3, x4;
        __SSE_DATATYPE y1, y2, y3, y4;
        __SSE_DATATYPE q1, q2, q3, q4;
        __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;

#ifdef DOUBLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
        x2 = _SSE_LOAD(&q_dbl[(2*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        x3 = _SSE_LOAD(&q_dbl[(2*ldq)+2*offset]);
        x4 = _SSE_LOAD(&q_dbl[(2*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
        // conjugate
        h2_imag = _SSE_XOR(h2_imag, sign);
#endif

        y1 = _SSE_LOAD(&q_dbl[0]);
        y2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        y3 = _SSE_LOAD(&q_dbl[2*offset]);
        y4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif

        tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
        y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
        y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
        y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
        y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
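        // The loop below accumulates x += conj(h1_i) * q_i and
        // y += conj(h2_i) * q_i over the rows i = 2 .. nb-1 of this block.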

        for (i = 2; i < nb; i++)
        {
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
#endif

                tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
                x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
                x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
                x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
                h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
                h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
                h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
                // conjugate
                h2_imag = _SSE_XOR(h2_imag, sign);
#endif

                tmp1 = _SSE_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
                y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
                y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
                y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
                y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
        }

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#endif

#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _SSE_XOR(h1_imag, sign);
#endif

        q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
#endif

        tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
        x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
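        // Next, x is scaled by -tau1 (the first element of hh); the sign flip
        // is done by XORing the broadcast real and imaginary parts with the
        // sign mask.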

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
#endif

        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);

        tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        x1 = _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        x3 = _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
        tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        x4 = _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        x4 = _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
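        // Below, y is transformed to -tau2*y - tau2*s*x: tau2 = hh[ldh] is
        // loaded and negated, the product -tau2*s is formed once in tmp2 and
        // re-broadcast into h2_real/h2_imag for the x contribution.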

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
#endif

        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);
        h2_real = _SSE_XOR(h2_real, sign);
        h2_imag = _SSE_XOR(h2_imag, sign);

#ifdef SINGLE_PRECISION_COMPLEX
        tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
#else
        tmp2 = _SSE_LOADU(s_dbl);
#endif
        tmp1 = _SSE_MUL(h2_imag, tmp2);

#ifdef __ELPA_USE_FMA__
        tmp2 = _mm_maddsub_pd(h2_real, tmp2, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_movedup_pd(tmp2);
        h2_imag = _mm_set1_pd(tmp2[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(tmp2);
        h2_imag = _mm_movehdup_ps(tmp2);
#endif

        tmp1 = _SSE_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
        y1 = _mm_maddsub_pd(h1_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        y1 = _SSE_ADDSUB( _SSE_MUL(h1_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
        y2 = _mm_maddsub_pd(h1_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        y2 = _SSE_ADDSUB( _SSE_MUL(h1_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
        y3 = _mm_maddsub_pd(h1_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        y3 = _SSE_ADDSUB( _SSE_MUL(h1_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
        tmp4 = _SSE_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
        y4 = _mm_maddsub_pd(h1_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        y4 = _SSE_ADDSUB( _SSE_MUL(h1_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
        y1 = _SSE_ADD(y1, _mm_maddsub_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
        y2 = _SSE_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
        y3 = _SSE_ADD(y3, _mm_maddsub_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
        y4 = _SSE_ADD(y4, _mm_maddsub_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
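        // Write the update back to q: row 0 receives y, row 1 receives x plus
        // hh[ldh+1]*y, rows 2 .. nb-1 receive hh[i-1]*x + hh[ldh+i]*y, and
        // row nb receives hh[nb-1]*x.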

        q1 = _SSE_LOAD(&q_dbl[0]);
        q2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[2*offset]);
        q4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif

        q1 = _SSE_ADD(q1, y1);
        q2 = _SSE_ADD(q2, y2);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_ADD(q3, y3);
        q4 = _SSE_ADD(q4, y4);
#endif

        _SSE_STORE(&q_dbl[0], q1);
        _SSE_STORE(&q_dbl[offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[2*offset], q3);
        _SSE_STORE(&q_dbl[3*offset], q4);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif

        q1 = _SSE_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _SSE_LOAD(&q_dbl[(ldq*2)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(ldq*2)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(ldq*2)+3*offset]);
#endif

        q1 = _SSE_ADD(q1, x1);
        q2 = _SSE_ADD(q2, x2);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_ADD(q3, x3);
        q4 = _SSE_ADD(q4, x4);
#endif

        tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
        q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
        q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
        q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
        q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        _SSE_STORE(&q_dbl[(ldq*2)+0], q1);
        _SSE_STORE(&q_dbl[(ldq*2)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[(ldq*2)+2*offset], q3);
        _SSE_STORE(&q_dbl[(ldq*2)+3*offset], q4);
#endif

        for (i = 2; i < nb; i++)
        {
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#endif

                tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
                q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
                q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
                h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
                h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
                h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#endif

                tmp1 = _SSE_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _mm_maddsub_pd(h2_real, y1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h2_real, y1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h2_real, y2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp3 = _SSE_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
                q3 = _SSE_ADD(q3, _mm_maddsub_pd(h2_real, y3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h2_real, y3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _SSE_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
                q4 = _SSE_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h2_real, y4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

                _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
                _SSE_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _SSE_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
#endif
        }

#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#endif

        q1 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q3 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
#endif

        tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        q1 = _SSE_ADD(q1, _mm_maddsub_pd(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        q2 = _SSE_ADD(q2, _mm_maddsub_pd(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        q3 = _SSE_ADD(q3, _mm_maddsub_pd(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        q4 = _SSE_ADD(q4, _mm_maddsub_pd(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        _SSE_STORE(&q_dbl[(2*nb*ldq)+0], q1);
        _SSE_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
        _SSE_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
#endif
}

#if 0

static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);

        __SSE_DATATYPE x1, x2, x3;
        __SSE_DATATYPE y1, y2, y3;
        __SSE_DATATYPE q1, q2, q3;
        __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3;
        int i=0;

        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
