//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)

#endif

// Forward declarations
static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);

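// Note: the "!f>" lines in the comment below declare the Fortran interface for
// this C kernel; the ELPA build extracts them when generating the Fortran bindings.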
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                 :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

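// Driver: applies the Householder transform H = I - tau * v * v^H (v of length nb
// packed in hh, with an implicit leading 1, and hh[0] holding tau) to nq vectors
// of q (leading dimension ldq), 24, 16 or 8 complex entries at a time.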
void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	//int ldh = *pldh;

	for (i = 0; i < nq-16; i+=24)
	{
		hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
	}
	if (nq == i)
	{
		return;
	}
	if (nq-i == 16)
	{
		hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
	}
	else
	{
		hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
	}
}

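/* Usage sketch (hypothetical sizes, not taken from ELPA itself): apply one
   reflector of length nb to an nq x nb block of q with ldq == nq:

       int nb = 64, nq = 48, ldq = 48;
       double complex *q;     // nq*nb entries, 64-byte aligned for _mm512_load_pd/_mm512_store_pd
       double complex *hh;    // nb entries: hh[0] = tau, hh[1..nb-1] = v(2:nb)
       // ... allocate and fill q and hh ...
       single_hh_trafo_complex_avx512_1hv_double(q, hh, &nb, &nq, &ldq);

   Note that nq is read from pldq above, and the remainder handling assumes
   nq is a multiple of 8 (blocks of 24, then a tail of 16 or 8).

   The three kernels below differ only in how many complex entries they keep in
   registers per row: 24, 16 or 8, i.e. 6, 4 or 2 ZMM registers.  Complex
   products are built from real FMAs on interleaved [re,im] pairs:
       conj(h)*q = fmsubadd(h_re, q, swap(h_im*q))
       h*x       = fmaddsub(h_re, x, swap(h_im*x))
   where swap() exchanges the real/imaginary slots via _mm512_shuffle_pd(...,0x55).
*/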
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m512d x1, x2, x3, x4, x5, x6;
	__m512d q1, q2, q3, q4, q5, q6;
	__m512d h1_real, h1_imag;
	__m512d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	int i=0;

	__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm512_load_pd(&q_dbl[0]);    // complex 1, 2, 3, 4
	x2 = _mm512_load_pd(&q_dbl[8]);    // complex 5, 6, 7, 8
	x3 = _mm512_load_pd(&q_dbl[16]);   // complex 9, 10, 11, 12
	x4 = _mm512_load_pd(&q_dbl[24]);   // complex 13, 14, 15, 16
	x5 = _mm512_load_pd(&q_dbl[32]);   // complex 17, 18, 19, 20
	x6 = _mm512_load_pd(&q_dbl[40]);   // complex 21, 22, 23, 24

	for (i = 1; i < nb; i++)
	{
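		// accumulate x += conj(hh(i)) * q(i,:), i.e. the 24-wide slice of v^H * q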
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
		q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
		q5 = _mm512_load_pd(&q_dbl[(2*i*ldq)+32]);
		q6 = _mm512_load_pd(&q_dbl[(2*i*ldq)+40]);

		tmp1 = _mm512_mul_pd(h1_imag, q1);

		x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, q2);

		x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));

		tmp3 = _mm512_mul_pd(h1_imag, q3);

		x3 = _mm512_add_pd(x3, _mm512_FMSUBADD_pd(h1_real, q3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));

		tmp4 = _mm512_mul_pd(h1_imag, q4);

		x4 = _mm512_add_pd(x4, _mm512_FMSUBADD_pd(h1_real, q4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));

		tmp5 = _mm512_mul_pd(h1_imag, q5);

		x5 = _mm512_add_pd(x5, _mm512_FMSUBADD_pd(h1_real, q5, _mm512_shuffle_pd(tmp5, tmp5, 0x55)));

		tmp6 = _mm512_mul_pd(h1_imag, q6);

		x6 = _mm512_add_pd(x6, _mm512_FMSUBADD_pd(h1_real, q6, _mm512_shuffle_pd(tmp6, tmp6, 0x55)));
	}

	h1_real = _mm512_set1_pd(hh_dbl[0]);
	h1_imag = _mm512_set1_pd(hh_dbl[1]);
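	// negate tau by flipping the sign bit of its real and imaginary parts,
	// so the FMAs below compute x := -tau * x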
//	h1_real = _mm512_xor_pd(h1_real, sign);
//	h1_imag = _mm512_xor_pd(h1_imag, sign);

	h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
	h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_pd(h1_imag, x1);

	x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));

	tmp2 = _mm512_mul_pd(h1_imag, x2);

	x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));

	tmp3 = _mm512_mul_pd(h1_imag, x3);

	x3 = _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55));

	tmp4 = _mm512_mul_pd(h1_imag, x4);

	x4 = _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55));

	tmp5 = _mm512_mul_pd(h1_imag, x5);

	x5 = _mm512_FMADDSUB_pd(h1_real, x5, _mm512_shuffle_pd(tmp5, tmp5, 0x55));

	tmp6 = _mm512_mul_pd(h1_imag, x6);

	x6 = _mm512_FMADDSUB_pd(h1_real, x6, _mm512_shuffle_pd(tmp6, tmp6, 0x55));

	q1 = _mm512_load_pd(&q_dbl[0]);
	q2 = _mm512_load_pd(&q_dbl[8]);
	q3 = _mm512_load_pd(&q_dbl[16]);
	q4 = _mm512_load_pd(&q_dbl[24]);
	q5 = _mm512_load_pd(&q_dbl[32]);
	q6 = _mm512_load_pd(&q_dbl[40]);

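	// v(1) == 1, so the first row is updated with x directly (no multiply needed)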
	q1 = _mm512_add_pd(q1, x1);
	q2 = _mm512_add_pd(q2, x2);
	q3 = _mm512_add_pd(q3, x3);
	q4 = _mm512_add_pd(q4, x4);
	q5 = _mm512_add_pd(q5, x5);
	q6 = _mm512_add_pd(q6, x6);

	_mm512_store_pd(&q_dbl[0], q1);
	_mm512_store_pd(&q_dbl[8], q2);
	_mm512_store_pd(&q_dbl[16], q3);
	_mm512_store_pd(&q_dbl[24], q4);
	_mm512_store_pd(&q_dbl[32], q5);
	_mm512_store_pd(&q_dbl[40], q6);

	for (i = 1; i < nb; i++)
	{
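		// rank-1 update of the remaining rows: q(i,:) += hh(i) * x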
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
		q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
		q5 = _mm512_load_pd(&q_dbl[(2*i*ldq)+32]);
		q6 = _mm512_load_pd(&q_dbl[(2*i*ldq)+40]);

		tmp1 = _mm512_mul_pd(h1_imag, x1);

		q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, x2);

		q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));

		tmp3 = _mm512_mul_pd(h1_imag, x3);

		q3 = _mm512_add_pd(q3, _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));

		tmp4 = _mm512_mul_pd(h1_imag, x4);

		q4 = _mm512_add_pd(q4, _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));

		tmp5 = _mm512_mul_pd(h1_imag, x5);

		q5 = _mm512_add_pd(q5, _mm512_FMADDSUB_pd(h1_real, x5, _mm512_shuffle_pd(tmp5, tmp5, 0x55)));

		tmp6 = _mm512_mul_pd(h1_imag, x6);

		q6 = _mm512_add_pd(q6, _mm512_FMADDSUB_pd(h1_real, x6, _mm512_shuffle_pd(tmp6, tmp6, 0x55)));

		_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+16], q3);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+24], q4);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+32], q5);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+40], q6);
	}
}

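// Same transform as above, operating on 16 complex entries (4 ZMM registers) per row.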
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m512d x1, x2, x3, x4;
	__m512d q1, q2, q3, q4;
	__m512d h1_real, h1_imag;
	__m512d tmp1, tmp2, tmp3, tmp4;
	int i=0;

	__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm512_load_pd(&q_dbl[0]);   // complex 1, 2, 3, 4
	x2 = _mm512_load_pd(&q_dbl[8]);
	x3 = _mm512_load_pd(&q_dbl[16]);
	x4 = _mm512_load_pd(&q_dbl[24]);  // complex 13, 14, 15, 16

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
		q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);

		tmp1 = _mm512_mul_pd(h1_imag, q1);

		x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, q2);

		x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));

		tmp3 = _mm512_mul_pd(h1_imag, q3);

		x3 = _mm512_add_pd(x3, _mm512_FMSUBADD_pd(h1_real, q3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));

		tmp4 = _mm512_mul_pd(h1_imag, q4);

		x4 = _mm512_add_pd(x4, _mm512_FMSUBADD_pd(h1_real, q4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));
	}

	h1_real = _mm512_set1_pd(hh_dbl[0]);
	h1_imag = _mm512_set1_pd(hh_dbl[1]);

//	h1_real = _mm512_xor_pd(h1_real, sign);
//	h1_imag = _mm512_xor_pd(h1_imag, sign);

	h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
	h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_pd(h1_imag, x1);

	x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));

	tmp2 = _mm512_mul_pd(h1_imag, x2);

	x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));

	tmp3 = _mm512_mul_pd(h1_imag, x3);

	x3 = _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55));

	tmp4 = _mm512_mul_pd(h1_imag, x4);

	x4 = _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55));

	q1 = _mm512_load_pd(&q_dbl[0]);
	q2 = _mm512_load_pd(&q_dbl[8]);
	q3 = _mm512_load_pd(&q_dbl[16]);
	q4 = _mm512_load_pd(&q_dbl[24]);

	q1 = _mm512_add_pd(q1, x1);
	q2 = _mm512_add_pd(q2, x2);
	q3 = _mm512_add_pd(q3, x3);
	q4 = _mm512_add_pd(q4, x4);

	_mm512_store_pd(&q_dbl[0], q1);
	_mm512_store_pd(&q_dbl[8], q2);
	_mm512_store_pd(&q_dbl[16], q3);
	_mm512_store_pd(&q_dbl[24], q4);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
		q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);

		tmp1 = _mm512_mul_pd(h1_imag, x1);

		q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, x2);

		q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));

		tmp3 = _mm512_mul_pd(h1_imag, x3);

		q3 = _mm512_add_pd(q3, _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));

		tmp4 = _mm512_mul_pd(h1_imag, x4);

		q4 = _mm512_add_pd(q4, _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));

		_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+16], q3);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+24], q4);
	}
}

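// Same transform as above, operating on 8 complex entries (2 ZMM registers) per row.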
static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m512d x1, x2;
	__m512d q1, q2;
	__m512d h1_real, h1_imag;
	__m512d tmp1, tmp2;
	int i=0;

	__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm512_load_pd(&q_dbl[0]);
	x2 = _mm512_load_pd(&q_dbl[8]);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);

		tmp1 = _mm512_mul_pd(h1_imag, q1);
		x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, q2);

		x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
	}

	h1_real = _mm512_set1_pd(hh_dbl[0]);
	h1_imag = _mm512_set1_pd(hh_dbl[1]);

//	h1_real = _mm512_xor_pd(h1_real, sign);
//	h1_imag = _mm512_xor_pd(h1_imag, sign);
	h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
	h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_pd(h1_imag, x1);

	x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));

	tmp2 = _mm512_mul_pd(h1_imag, x2);

	x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));

	q1 = _mm512_load_pd(&q_dbl[0]);
	q2 = _mm512_load_pd(&q_dbl[8]);

	q1 = _mm512_add_pd(q1, x1);
	q2 = _mm512_add_pd(q2, x2);

	_mm512_store_pd(&q_dbl[0], q1);
	_mm512_store_pd(&q_dbl[8], q2);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_pd(hh_dbl[i*2]);
		h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);

		tmp1 = _mm512_mul_pd(h1_imag, x1);

		q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));

		tmp2 = _mm512_mul_pd(h1_imag, x2);

		q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));

		_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
	}
}