//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
// For the AVX-512 kernels in this file an AVX-512 enabled flag is needed as well (e.g. -O3 -march=skylake-avx512 with GCC)
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
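// Helper macros for the interleaved complex arithmetic below: fmaddsub subtracts the third
// operand in the even (real) lanes and adds it in the odd (imaginary) lanes; fmsubadd does
// the opposite.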
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)

#endif

// Forward declarations
static  __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     complex(kind=c_float)     :: q(*)
!f>     complex(kind=c_float)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
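
// The kernels below apply a single complex Householder reflector v = (1, hh[1], ..., hh[nb-1]),
// with tau stored in hh[0], to a block of entries of q. As a rough scalar sketch of the same
// transform (illustrative only; the helper name is made up and the block is excluded from the build):
#if 0
static void hh_trafo_complex_reference_single(float complex* q, float complex* hh, int nb, int ldq, int nq)
{
	int i, j;
	for (j = 0; j < nq; j++)
	{
		// x = q(j,0) + sum_{i>=1} conj(hh[i]) * q(j,i); the leading coefficient of v is 1
		float complex x = q[j];
		for (i = 1; i < nb; i++)
		{
			x += conjf(hh[i]) * q[j + i*ldq];
		}

		// scale by -tau
		x = -hh[0] * x;

		// rank-1 update: q(j,i) += hh[i] * x, with the implicit 1 used for i = 0
		q[j] += x;
		for (i = 1; i < nb; i++)
		{
			q[j + i*ldq] += hh[i] * x;
		}
	}
}
#endif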

void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	//int ldh = *pldh;
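	// Process the nq entries of q in blocks of 48 complex values; any remainder of 32 or 16
	// entries (nq is assumed to be a multiple of 16) is handled by the smaller kernels below.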

	for (i = 0; i < nq-32; i+=48)
	{
		hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
	}
	if (nq == i)
	{
		return;
	}
	if (nq-i == 32)
	{
		hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
	}
	else
	{
		hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
	}
}

static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m512 x1, x2, x3, x4, x5, x6;
	__m512 q1, q2, q3, q4, q5, q6;
	__m512 h1_real, h1_imag;
	__m512 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	int i=0;

	__m512 sign = (__m512)_mm512_set1_epi32(0x80000000);

	x1 = _mm512_load_ps(&q_dbl[0]);    // complex 1, 2, 3, 4, 5, 6, 7, 8
	x2 = _mm512_load_ps(&q_dbl[16]);   // complex 9, 10, 11, 12, 13, 14, 15, 16
	x3 = _mm512_load_ps(&q_dbl[32]);   // complex 17 .. 24
	x4 = _mm512_load_ps(&q_dbl[48]);   // complex 25 .. 32
	x5 = _mm512_load_ps(&q_dbl[64]);   // complex 33 .. 40
	x6 = _mm512_load_ps(&q_dbl[80]);   // complex 41 .. 48
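	// Accumulate x += conj(hh[i]) * q(:,i) for i = 1..nb-1: tmp holds h1_imag * q, the 0xb1
	// shuffle swaps the real/imaginary parts within each complex pair, and fmsubadd combines
	// both halves into the conjugated complex product.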

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);
		q3 = _mm512_load_ps(&q_dbl[(2*i*ldq)+32]);
		q4 = _mm512_load_ps(&q_dbl[(2*i*ldq)+48]);
		q5 = _mm512_load_ps(&q_dbl[(2*i*ldq)+64]);
		q6 = _mm512_load_ps(&q_dbl[(2*i*ldq)+80]);

		tmp1 = _mm512_mul_ps(h1_imag, q1);
		x1 = _mm512_add_ps(x1, _mm512_FMSUBADD_ps(h1_real, q1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));

		tmp2 = _mm512_mul_ps(h1_imag, q2);

		x2 = _mm512_add_ps(x2, _mm512_FMSUBADD_ps(h1_real, q2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));

		tmp3 = _mm512_mul_ps(h1_imag, q3);

		x3 = _mm512_add_ps(x3, _mm512_FMSUBADD_ps(h1_real, q3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1)));

		tmp4 = _mm512_mul_ps(h1_imag, q4);

		x4 = _mm512_add_ps(x4, _mm512_FMSUBADD_ps(h1_real, q4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1)));

		tmp5 = _mm512_mul_ps(h1_imag, q5);

		x5 = _mm512_add_ps(x5, _mm512_FMSUBADD_ps(h1_real, q5, _mm512_shuffle_ps(tmp5, tmp5, 0xb1)));

		tmp6 = _mm512_mul_ps(h1_imag, q6);

		x6 = _mm512_add_ps(x6, _mm512_FMSUBADD_ps(h1_real, q6, _mm512_shuffle_ps(tmp6, tmp6, 0xb1)));
	}
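	// x := -tau * x  (tau is stored in hh[0]): the sign mask flips both components of tau,
	// and fmaddsub then forms the ordinary (non-conjugated) complex product.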

	h1_real = _mm512_set1_ps(hh_dbl[0]);
	h1_imag = _mm512_set1_ps(hh_dbl[1]);

//	h1_real = _mm512_xor_ps(h1_real, sign);
//	h1_imag = _mm512_xor_ps(h1_imag, sign);

        h1_real = (__m512) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__m512) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_ps(h1_imag, x1);

	x1 = _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1));

	tmp2 = _mm512_mul_ps(h1_imag, x2);

	x2 = _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1));

	tmp3 = _mm512_mul_ps(h1_imag, x3);

	x3 = _mm512_FMADDSUB_ps(h1_real, x3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1));

	tmp4 = _mm512_mul_ps(h1_imag, x4);

	x4 = _mm512_FMADDSUB_ps(h1_real, x4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1));

	tmp5 = _mm512_mul_ps(h1_imag, x5);

	x5 = _mm512_FMADDSUB_ps(h1_real, x5, _mm512_shuffle_ps(tmp5, tmp5, 0xb1));

	tmp6 = _mm512_mul_ps(h1_imag, x6);

	x6 = _mm512_FMADDSUB_ps(h1_real, x6, _mm512_shuffle_ps(tmp6, tmp6, 0xb1));

	q1 = _mm512_load_ps(&q_dbl[0]);
	q2 = _mm512_load_ps(&q_dbl[16]);
	q3 = _mm512_load_ps(&q_dbl[32]);
	q4 = _mm512_load_ps(&q_dbl[48]);
	q5 = _mm512_load_ps(&q_dbl[64]);
	q6 = _mm512_load_ps(&q_dbl[80]);

	q1 = _mm512_add_ps(q1, x1);
	q2 = _mm512_add_ps(q2, x2);
	q3 = _mm512_add_ps(q3, x3);
	q4 = _mm512_add_ps(q4, x4);
	q5 = _mm512_add_ps(q5, x5);
	q6 = _mm512_add_ps(q6, x6);

	_mm512_store_ps(&q_dbl[0], q1);
	_mm512_store_ps(&q_dbl[16], q2);
	_mm512_store_ps(&q_dbl[32], q3);
	_mm512_store_ps(&q_dbl[48], q4);
	_mm512_store_ps(&q_dbl[64], q5);
	_mm512_store_ps(&q_dbl[80], q6);
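
	// Rank-1 update of the remaining columns: q(:,i) += hh[i] * x for i = 1..nb-1
	// (the first column, whose Householder coefficient is 1, was already updated above).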

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);
		q3 = _mm512_load_ps(&q_dbl[(2*i*ldq)+32]);
		q4 = _mm512_load_ps(&q_dbl[(2*i*ldq)+48]);
		q5 = _mm512_load_ps(&q_dbl[(2*i*ldq)+64]);
		q6 = _mm512_load_ps(&q_dbl[(2*i*ldq)+80]);

		tmp1 = _mm512_mul_ps(h1_imag, x1);

		q1 = _mm512_add_ps(q1, _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));

		tmp2 = _mm512_mul_ps(h1_imag, x2);

		q2 = _mm512_add_ps(q2, _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));

		tmp3 = _mm512_mul_ps(h1_imag, x3);

		q3 = _mm512_add_ps(q3, _mm512_FMADDSUB_ps(h1_real, x3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1)));

		tmp4 = _mm512_mul_ps(h1_imag, x4);

		q4 = _mm512_add_ps(q4, _mm512_FMADDSUB_ps(h1_real, x4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1)));

		tmp5 = _mm512_mul_ps(h1_imag, x5);

		q5 = _mm512_add_ps(q5, _mm512_FMADDSUB_ps(h1_real, x5, _mm512_shuffle_ps(tmp5, tmp5, 0xb1)));

		tmp6 = _mm512_mul_ps(h1_imag, x6);

		q6 = _mm512_add_ps(q6, _mm512_FMADDSUB_ps(h1_real, x6, _mm512_shuffle_ps(tmp6, tmp6, 0xb1)));

		_mm512_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+16], q2);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+32], q3);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+48], q4);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+64], q5);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+80], q6);
	}
}
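
// Variant of the kernel above for 32 complex entries of q per call, using four 16-float vectors.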

static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m512 x1, x2, x3, x4;
	__m512 q1, q2, q3, q4;
	__m512 h1_real, h1_imag;
	__m512 tmp1, tmp2, tmp3, tmp4;
	int i=0;

	__m512 sign = (__m512)_mm512_set1_epi32(0x80000000);

	x1 = _mm512_load_ps(&q_dbl[0]);   // complex 1 2 3 4 5 6 7 8
	x2 = _mm512_load_ps(&q_dbl[16]);
	x3 = _mm512_load_ps(&q_dbl[32]);
	x4 = _mm512_load_ps(&q_dbl[48]);  // complex 25 .. 32

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);
		q3 = _mm512_load_ps(&q_dbl[(2*i*ldq)+32]);
		q4 = _mm512_load_ps(&q_dbl[(2*i*ldq)+48]);

		tmp1 = _mm512_mul_ps(h1_imag, q1);

		x1 = _mm512_add_ps(x1, _mm512_FMSUBADD_ps(h1_real, q1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));

		tmp2 = _mm512_mul_ps(h1_imag, q2);

		x2 = _mm512_add_ps(x2, _mm512_FMSUBADD_ps(h1_real, q2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));

		tmp3 = _mm512_mul_ps(h1_imag, q3);

		x3 = _mm512_add_ps(x3, _mm512_FMSUBADD_ps(h1_real, q3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1)));

		tmp4 = _mm512_mul_ps(h1_imag, q4);

		x4 = _mm512_add_ps(x4, _mm512_FMSUBADD_ps(h1_real, q4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1)));
	}

	h1_real = _mm512_set1_ps(hh_dbl[0]);
	h1_imag = _mm512_set1_ps(hh_dbl[1]);

//	h1_real = _mm512_xor_ps(h1_real, sign);
//	h1_imag = _mm512_xor_ps(h1_imag, sign);
        h1_real = (__m512) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__m512) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_ps(h1_imag, x1);

	x1 = _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1));

	tmp2 = _mm512_mul_ps(h1_imag, x2);

	x2 = _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1));

	tmp3 = _mm512_mul_ps(h1_imag, x3);

	x3 = _mm512_FMADDSUB_ps(h1_real, x3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1));

	tmp4 = _mm512_mul_ps(h1_imag, x4);

	x4 = _mm512_FMADDSUB_ps(h1_real, x4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1));

	q1 = _mm512_load_ps(&q_dbl[0]);
	q2 = _mm512_load_ps(&q_dbl[16]);
	q3 = _mm512_load_ps(&q_dbl[32]);
	q4 = _mm512_load_ps(&q_dbl[48]);

	q1 = _mm512_add_ps(q1, x1);
	q2 = _mm512_add_ps(q2, x2);
	q3 = _mm512_add_ps(q3, x3);
	q4 = _mm512_add_ps(q4, x4);

	_mm512_store_ps(&q_dbl[0], q1);
	_mm512_store_ps(&q_dbl[16], q2);
	_mm512_store_ps(&q_dbl[32], q3);
	_mm512_store_ps(&q_dbl[48], q4);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);
		q3 = _mm512_load_ps(&q_dbl[(2*i*ldq)+32]);
		q4 = _mm512_load_ps(&q_dbl[(2*i*ldq)+48]);

		tmp1 = _mm512_mul_ps(h1_imag, x1);

		q1 = _mm512_add_ps(q1, _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));

		tmp2 = _mm512_mul_ps(h1_imag, x2);

		q2 = _mm512_add_ps(q2, _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));

		tmp3 = _mm512_mul_ps(h1_imag, x3);

		q3 = _mm512_add_ps(q3, _mm512_FMADDSUB_ps(h1_real, x3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1)));

		tmp4 = _mm512_mul_ps(h1_imag, x4);

		q4 = _mm512_add_ps(q4, _mm512_FMADDSUB_ps(h1_real, x4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1)));

		_mm512_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+16], q2);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+32], q3);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+48], q4);
	}
}
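
// Variant for 16 complex entries of q per call, using two 16-float vectors.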

static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m512 x1, x2;
	__m512 q1, q2;
	__m512 h1_real, h1_imag;
	__m512 tmp1, tmp2;
	int i=0;

	__m512 sign = (__m512)_mm512_set1_epi32(0x80000000);

	x1 = _mm512_load_ps(&q_dbl[0]);
	x2 = _mm512_load_ps(&q_dbl[16]);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);

		tmp1 = _mm512_mul_ps(h1_imag, q1);
		x1 = _mm512_add_ps(x1, _mm512_FMSUBADD_ps(h1_real, q1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));
		tmp2 = _mm512_mul_ps(h1_imag, q2);
		x2 = _mm512_add_ps(x2, _mm512_FMSUBADD_ps(h1_real, q2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));
	}

	h1_real = _mm512_set1_ps(hh_dbl[0]);
	h1_imag = _mm512_set1_ps(hh_dbl[1]);

//	h1_real = _mm512_xor_ps(h1_real, sign);
//	h1_imag = _mm512_xor_ps(h1_imag, sign);
	h1_real = (__m512) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
	h1_imag = (__m512) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);

	tmp1 = _mm512_mul_ps(h1_imag, x1);
	x1 = _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1));

	tmp2 = _mm512_mul_ps(h1_imag, x2);

	x2 = _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1));

	q1 = _mm512_load_ps(&q_dbl[0]);
	q2 = _mm512_load_ps(&q_dbl[16]);

	q1 = _mm512_add_ps(q1, x1);
	q2 = _mm512_add_ps(q2, x2);

	_mm512_store_ps(&q_dbl[0], q1);
	_mm512_store_ps(&q_dbl[16], q2);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm512_set1_ps(hh_dbl[i*2]);
		h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);

		q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);

		tmp1 = _mm512_mul_ps(h1_imag, x1);
		q1 = _mm512_add_ps(q1, _mm512_FMADDSUB_ps(h1_real, x1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));

		tmp2 = _mm512_mul_ps(h1_imag, x2);

		q2 = _mm512_add_ps(q2, _mm512_FMADDSUB_ps(h1_real, x2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));

		_mm512_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm512_store_ps(&q_dbl[(2*i*ldq)+16], q2);
	}
}