// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem, Intel Westmere, or AMD Magny-Cours, use -O3 -msse3
// On Intel Sandy Bridge, use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
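//
// Layout of this file: forward declarations of the SIMD kernels, a scalar
// reference kernel (disabled via "#if 0"), the Fortran-callable dispatch
// routine double_hh_trafo_complex_, and the AVX/SSE kernel implementations.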

#include <complex>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef __USE_AVX128__
#undef __AVX__
#endif

// Forward declarations
#ifdef __AVX__
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#else
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#endif
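
// The block below (disabled via "#if 0") appears to be a scalar reference
// version of the rank-2 Householder update, written with plain std::complex
// arithmetic; it is a readable counterpart to the SSE/AVX kernels further down.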

#if 0
extern "C" __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
	std::complex<double> x1;
	std::complex<double> x2;
	std::complex<double> x3;
	std::complex<double> x4;
	std::complex<double> y1;
	std::complex<double> y2;
	std::complex<double> y3;
	std::complex<double> y4;
	std::complex<double> h1;
	std::complex<double> h2;
	std::complex<double> tau1;
	std::complex<double> tau2;
	int i=0;

	x1 = q[ldq+0];
	x2 = q[ldq+1];
	x3 = q[ldq+2];
	x4 = q[ldq+3];

	h2 = conj(hh[ldh+1]);

	y1 = q[0] + (x1*h2);
	y2 = q[1] + (x2*h2);
	y3 = q[2] + (x3*h2);
	y4 = q[3] + (x4*h2);

	for (i = 2; i < nb; i++)
	{
		h1 = conj(hh[i-1]);
		h2 = conj(hh[ldh+i]);

		x1 += (q[(i*ldq)+0] * h1);
		y1 += (q[(i*ldq)+0] * h2);
		x2 += (q[(i*ldq)+1] * h1);
		y2 += (q[(i*ldq)+1] * h2);
		x3 += (q[(i*ldq)+2] * h1);
		y3 += (q[(i*ldq)+2] * h2);
		x4 += (q[(i*ldq)+3] * h1);
		y4 += (q[(i*ldq)+3] * h2);
	}
	h1 = conj(hh[nb-1]);

	x1 += (q[(nb*ldq)+0] * h1);
	x2 += (q[(nb*ldq)+1] * h1);
	x3 += (q[(nb*ldq)+2] * h1);
	x4 += (q[(nb*ldq)+3] * h1);

	tau1 = hh[0];
	tau2 = hh[ldh];

	h1 = (-1.0)*tau1;

	x1 *= h1;
	x2 *= h1;
	x3 *= h1;
	x4 *= h1;

	h1 = (-1.0)*tau2;
	h2 = (-1.0)*tau2;
	h2 *= s;
	y1 = y1*h1 +x1*h2;
	y2 = y2*h1 +x2*h2;
	y3 = y3*h1 +x3*h2;
	y4 = y4*h1 +x4*h2;

	q[0] += y1;
	q[1] += y2;
	q[2] += y3;
	q[3] += y4;

	h2 = hh[ldh+1];
	q[ldq+0] += (x1 + (y1*h2));
	q[ldq+1] += (x2 + (y2*h2));
	q[ldq+2] += (x3 + (y3*h2));
	q[ldq+3] += (x4 + (y4*h2));

	for (i = 2; i < nb; i++)
	{
		h1 = hh[i-1];
		h2 = hh[ldh+i];

		q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
		q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
		q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
		q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
	}

	h1 = hh[nb-1];
	q[(nb*ldq)+0] += (x1*h1);
	q[(nb*ldq)+1] += (x2*h1);
	q[(nb*ldq)+2] += (x3*h1);
	q[(nb*ldq)+3] += (x4*h1);
}
#endif

extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;
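
	// s is the overlap of the two Householder vectors,
	//   s = conj(hh[ldh+1]) + sum_{i=2..nb-1} hh[i-1] * conj(hh[ldh+i]),
	// and is passed to the kernels to couple the two reflectors in the rank-2 update.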

	std::complex<double> s = conj(hh[(ldh)+1])*1.0;
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}
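
	// Dispatch over the nq entries to be updated: the AVX path processes 8
	// complex entries per kernel call with a 4-entry tail (the disabled #else
	// variant uses 6/4/2), while the SSE path processes 4 entries per call
	// (alternatively 3/2/1).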

#ifdef __AVX__
#if 1
	for (i = 0; i < nq-4; i+=8)
	{
		hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	if (nq-i > 0)
	{
		hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#else
	for (i = 0; i < nq-4; i+=6)
	{
		hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	if (nq-i > 2)
	{
		hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	else if (nq-i > 0)
	{
		hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#endif
#else
#if 1
	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#else
	for (i = 0; i < nq-2; i+=3)
	{
		hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	if (nq-i > 1)
	{
		hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	else if (nq-i > 0)
	{
		hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#endif
#endif
}
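
// Illustrative call from C/C++ (hypothetical; within ELPA this entry point is
// reached from Fortran, hence the trailing underscore and the pass-by-pointer
// integer arguments):
//
//   int nb = ..., nq = ..., ldq = ..., ldh = ...;
//   std::complex<double>* q  = ...;  // accessed as q[row*ldq + col], row = 0..nb, col = 0..nq-1
//   std::complex<double>* hh = ...;  // two Householder vectors: hh[0..nb-1] and hh[ldh..ldh+nb-1]
//   double_hh_trafo_complex_(q, hh, &nb, &nq, &ldq, &ldh);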

#ifdef __AVX__
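// All vector kernels in this file use the same idiom for multiplying interleaved
// (real, imag) pairs by a scalar h that has been split into broadcast real and
// imaginary parts:
//     tmp = h_imag * v
//     res = addsub(h_real * v, swap_pairs(tmp))
// which yields (re(h)*re(v) - im(h)*im(v), re(h)*im(v) + im(h)*re(v)) = h*v in
// every pair. Conjugating h amounts to flipping the sign bit of h_imag (the XOR
// with the 'sign' mask), and the FMA4 variants fuse the multiply with the
// add/sub step: _mm256_maddsub_pd for the plain product, _mm256_msubadd_pd for
// the conjugated one (which is why the explicit sign flip sits inside
// "#ifndef __FMA4__").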
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;
	double* s_dbl = (double*)(&s);

	__m256d x1, x2, x3, x4;
	__m256d y1, y2, y3, y4;
	__m256d q1, q2, q3, q4;
	__m256d h1_real, h1_imag, h2_real, h2_imag;
	__m256d tmp1, tmp2, tmp3, tmp4;
	int i=0;

	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
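	// Only the IEEE sign bit is set in each 64-bit lane of 'sign'; XOR-ing with
	// it negates doubles and is used for conjugation (flipping imaginary parts)
	// and for applying the factors -tau1/-tau2.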

	x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
	x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
	x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);
	x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]);

	h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __FMA4__
	// conjugate
	h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

	y1 = _mm256_load_pd(&q_dbl[0]);
	y2 = _mm256_load_pd(&q_dbl[4]);
	y3 = _mm256_load_pd(&q_dbl[8]);
	y4 = _mm256_load_pd(&q_dbl[12]);

	tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
	y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
	y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
	y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
	y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
#ifndef __FMA4__
		// conjugate
		h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
		x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
		x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
		x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
		x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

		h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __FMA4__
		// conjugate
		h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_pd(h2_imag, q1);
#ifdef __FMA4__
		y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __FMA4__
		y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __FMA4__
		y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h2_imag, q4);
#ifdef __FMA4__
		y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __FMA4__
	// conjugate
	h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
	x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
	x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
	x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
	x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
	x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

	h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
	x1 = _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
	x2 = _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
	x3 = _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
	x4 = _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

	h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
	h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);

	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);
	h2_real = _mm256_xor_pd(h2_real, sign);
	h2_imag = _mm256_xor_pd(h2_imag, sign);
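
	// Scale s by -tau2: broadcast s into a 256-bit register, form the complex
	// product with (h2_real, h2_imag) = -tau2, write the low 128 bits back to
	// s_dbl, and re-broadcast the result as the h2 factor for the coupled
	// update of y below.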

	__m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
	tmp2 = _mm256_broadcast_pd(&tmp_s_128);
	tmp1 = _mm256_mul_pd(h2_imag, tmp2);
#ifdef __FMA4__
	tmp2 = _mm256_maddsub_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
	h2_real = _mm256_broadcast_sd(&s_dbl[0]);
	h2_imag = _mm256_broadcast_sd(&s_dbl[1]);

	tmp1 = _mm256_mul_pd(h1_imag, y1);
#ifdef __FMA4__
	y1 = _mm256_maddsub_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, y2);
#ifdef __FMA4__
	y2 = _mm256_maddsub_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __FMA4__
	y3 = _mm256_maddsub_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, y4);
#ifdef __FMA4__
	y4 = _mm256_maddsub_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
	y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

	tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
	y1 = _mm256_add_pd(y1, _mm256_maddsub_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
	y2 = _mm256_add_pd(y2, _mm256_maddsub_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
	y3 = _mm256_add_pd(y3, _mm256_maddsub_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
	y4 = _mm256_add_pd(y4, _mm256_maddsub_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

	q1 = _mm256_load_pd(&q_dbl[0]);
	q2 = _mm256_load_pd(&q_dbl[4]);
	q3 = _mm256_load_pd(&q_dbl[8]);
	q4 = _mm256_load_pd(&q_dbl[12]);

	q1 = _mm256_add_pd(q1, y1);
	q2 = _mm256_add_pd(q2, y2);
	q3 = _mm256_add_pd(q3, y3);
	q4 = _mm256_add_pd(q4, y4);

	_mm256_store_pd(&q_dbl[0], q1);
	_mm256_store_pd(&q_dbl[4], q2);
	_mm256_store_pd(&q_dbl[8], q3);
	_mm256_store_pd(&q_dbl[12], q4);

	h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);

	q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
	q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]);
	q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
	q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]);

	q1 = _mm256_add_pd(q1, x1);
	q2 = _mm256_add_pd(q2, x2);
	q3 = _mm256_add_pd(q3, x3);
	q4 = _mm256_add_pd(q4, x4);

	tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
	q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
	q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
	q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
	q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

	_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
	_mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
	_mm256_store_pd(&q_dbl[(ldq*2)+8], q3);
	_mm256_store_pd(&q_dbl[(ldq*2)+12], q4);

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);

		tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
		q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
		q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
		q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
		q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

		h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);

		tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
		q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
		q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
		q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
		q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
	}
	h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);

	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
	q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
	q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
	q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
	q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

	_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
	_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
	_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
	_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
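
// hh_trafo_complex_kernel_6_AVX_2hv mirrors the 8-entry kernel above, processing
// 6 complex entries (three 256-bit registers) per step.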

extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;
	double* s_dbl = (double*)(&s);

	__m256d x1, x2, x3;
	__m256d y1, y2, y3;
	__m256d q1, q2, q3;
	__m256d h1_real, h1_imag, h2_real, h2_imag;
	__m256d tmp1, tmp2, tmp3;
	int i=0;

	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
	x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
	x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);

	h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __FMA4__
	// conjugate
	h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

	y1 = _mm256_load_pd(&q_dbl[0]);
	y2 = _mm256_load_pd(&q_dbl[4]);
	y3 = _mm256_load_pd(&q_dbl[8]);

	tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
	y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
	y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
	y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);

		h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
#ifndef __FMA4__
		// conjugate
		h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
		x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
		x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
		x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

		h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __FMA4__
		// conjugate
		h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_pd(h2_imag, q1);
#ifdef __FMA4__
		y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __FMA4__
		y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __FMA4__
		y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __FMA4__
	// conjugate
	h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);

	tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
	x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
	x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
	x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

	h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
	x1 = _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
	x2 = _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
	x3 = _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif

	h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
	h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);

	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);
	h2_real = _mm256_xor_pd(h2_real, sign);
	h2_imag = _mm256_xor_pd(h2_imag, sign);

	__m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
	tmp2 = _mm256_broadcast_pd(&tmp_s_128);
	tmp1 = _mm256_mul_pd(h2_imag, tmp2);
#ifdef __FMA4__
	tmp2 = _mm256_maddsub_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
	h2_real = _mm256_broadcast_sd(&s_dbl[0]);
	h2_imag = _mm256_broadcast_sd(&s_dbl[1]);

	tmp1 = _mm256_mul_pd(h1_imag, y1);
#ifdef __FMA4__
	y1 = _mm256_maddsub_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, y2);
#ifdef __FMA4__
	y2 = _mm256_maddsub_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __FMA4__
	y3 = _mm256_maddsub_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif

	tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
	y1 = _mm256_add_pd(y1, _mm256_maddsub_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
	y2 = _mm256_add_pd(y2, _mm256_maddsub_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
	y3 = _mm256_add_pd(y3, _mm256_maddsub_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

	q1 = _mm256_load_pd(&q_dbl[0]);
	q2 = _mm256_load_pd(&q_dbl[4]);
	q3 = _mm256_load_pd(&q_dbl[8]);

	q1 = _mm256_add_pd(q1, y1);
	q2 = _mm256_add_pd(q2, y2);
	q3 = _mm256_add_pd(q3, y3);

	_mm256_store_pd(&q_dbl[0], q1);
	_mm256_store_pd(&q_dbl[4], q2);
	_mm256_store_pd(&q_dbl[8], q3);

	h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);

	q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
	q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]);
	q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);

	q1 = _mm256_add_pd(q1, x1);
	q2 = _mm256_add_pd(q2, x2);
	q3 = _mm256_add_pd(q3, x3);

	tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
	q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
	q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
	tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
	q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
	q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
	q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

	_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
	_mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
	_mm256_store_pd(&q_dbl[(ldq*2)+8], q3);

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);

		h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);

		tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
		q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
		q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
		q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

		h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);

		tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
		q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
		q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
		q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif

		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
	}
	h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);

	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);