//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

/* Ask the compiler to always inline the static kernels (GCC/Clang attribute). */
#define __forceinline __attribute__((always_inline))

/*
 * Precision-dependent aliases.  Exactly one of DOUBLE_PRECISION_COMPLEX or
 * SINGLE_PRECISION_COMPLEX is expected to be defined by whatever includes
 * this template; the kernels are written once against the _AVX512_* names.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
/* Real elements per 512-bit register: 8 doubles (= 4 double complex values). */
#define offset 8

#define __AVX512_DATATYPE __m512d
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_SET _mm512_set_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
/* The floating-point XOR alias is only provided for the HAVE_AVX512_XEON build. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
/* Shuffle immediate that swaps the two doubles of every (real, imag) pair. */
#define _SHUFFLE 0x55

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
#endif

#define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
#define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
/* Real elements per 512-bit register: 16 floats (= 8 float complex values). */
#define offset 16

#define __AVX512_DATATYPE __m512
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_SET _mm512_set_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
/* The floating-point XOR alias is only provided for the HAVE_AVX512_XEON build. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
/* Shuffle immediate that swaps the two floats of every (real, imag) pair. */
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
#endif

#define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
#define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */

// Forward declarations of the static kernels. The number in each name is the
// panel width, in complex elements, that the driver hands to that kernel.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif

#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
#endif

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)     :: q(*)
!f>     type(c_ptr), value                  :: q
!f>     complex(kind=c_float_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
 * Driver for the two-vector complex AVX-512 kernels.
 *
 * Computes the scalar
 *     s = conj(hh[ldh+1]) + sum_{i=2}^{nb-1} hh[i-1] * conj(hh[ldh+i])
 * and then walks across q in panels, passing &q[i], hh, nb, ldq, ldh and s to
 * the widest static kernel that fits: full-width panels first (16 double /
 * 32 single complex elements), then one remainder kernel of 3, 2 or 1
 * register widths.
 *
 * Parameters (all scalars are passed by pointer):
 *   q     matrix data, updated in place by the kernels
 *   hh    the two coefficient vectors; the second one starts at hh[ldh]
 *   pnb   number of coefficients per vector
 *   pnq   not read by this function (see note on nq below)
 *   pldq  row stride of q in complex elements
 *   pldh  stride between the two vectors in hh
 *
 * With WITH_DEBUG defined, the function aborts if the panel widths that were
 * dispatched do not add up to nq.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx512_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
        int i;
        int nb = *pnb;
        /* NOTE(review): the width is taken from *pldq, not *pnq, so the whole
         * leading dimension is processed.  The remainder dispatch below only
         * covers widths that are multiples of one register; confirm with the
         * caller before changing this. */
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;

        /* Running count of processed complex elements; only checked under WITH_DEBUG. */
        worked_on = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
        double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float complex s = conj(hh[(ldh)+1])*1.0f;
#endif
        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * conj(hh[(i+ldh)]);
        }

        /* Full-width panels.  The loop bound leaves at most three register
         * widths for the remainder kernels below. */
#ifdef DOUBLE_PRECISION_COMPLEX
        for (i = 0; i < nq-12; i+=16)
        {
                hh_trafo_complex_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        for (i = 0; i < nq-24; i+=32)
        {
                hh_trafo_complex_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 32;
        }
#endif
        /* Nothing left over: done. */
        if (nq-i == 0) {
                return;
        }

        /* Remainder of three register widths. */
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 12 )
        {
                hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 12;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 24 )
        {
                hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 24;
        }
#endif

        /* Remainder of two register widths. */
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 8 )
        {
                hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 16 )
        {
                hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
#endif

        /* Remainder of one register width. */
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 4 ) {

                hh_trafo_complex_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 4;
        }
#endif

#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 8 ) {

                hh_trafo_complex_kernel_8_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
#endif
#ifdef WITH_DEBUG
        if (worked_on != nq)
        {
             printf("Error in complex AVX512 BLOCK 2 kernel \n");
             abort();
        }
#endif
}

/*
 * Four-register-wide kernel (16 double complex / 32 float complex elements
 * per row).  Rows of q are ldq complex elements apart; rows 0..nb are read
 * and updated in place.  Writing q[r] for row r of the panel, the code
 * computes, lane-wise on complex values:
 *
 *   x  = q[1] + sum_{i=2}^{nb-1} conj(hh[i-1])   * q[i] + conj(hh[nb-1]) * q[nb]
 *   y  = q[0] + conj(hh[ldh+1]) * q[1] + sum_{i=2}^{nb-1} conj(hh[ldh+i]) * q[i]
 *   x  = (-hh[0]) * x
 *   y  = (-hh[ldh]) * y + ((-hh[ldh]) * s) * x
 *   q[0]  += y
 *   q[1]  += x + hh[ldh+1] * y
 *   q[i]  += hh[i-1] * x + hh[ldh+i] * y      for i = 2 .. nb-1
 *   q[nb] += hh[nb-1] * x
 *
 * The negations are only applied when HAVE_AVX512_XEON or
 * HAVE_AVX512_XEON_PHI is defined.
 *
 * Complex products are built from a real broadcast and an imaginary
 * broadcast of the scalar: tmp = h_imag * v, then
 *   _AVX512_FMSUBADD(h_real, v, swap(tmp))  gives conj(h) * v
 *   _AVX512_FMADDSUB(h_real, v, swap(tmp))  gives h * v
 * where swap() exchanges the real and imaginary slot of every pair.
 *
 * q must satisfy the alignment required by the aligned 512-bit load/store
 * intrinsics.  s is received by value; the local copy is overwritten.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{

        /* View the complex arrays as interleaved (real, imag) scalars.  The
         * "_dbl" suffix is kept for both precisions. */
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
#endif
        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE y1, y2, y3, y4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;

        /* Mask with only the sign bit of every element set; XOR with it negates. */
#ifdef DOUBLE_PRECISION_COMPLEX
       __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        /* x starts as row 1 of the panel (element comments are for the double case). */
        x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
        x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
        x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
        x4 = _AVX512_LOAD(&q_dbl[(2*ldq)+3*offset]); // q13, q14, q15, q16

        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

        /* y starts as row 0 plus conj(hh[ldh+1]) * row 1. */
        y1 = _AVX512_LOAD(&q_dbl[0]);
        y2 = _AVX512_LOAD(&q_dbl[offset]);
        y3 = _AVX512_LOAD(&q_dbl[2*offset]);
        y4 = _AVX512_LOAD(&q_dbl[3*offset]);

        tmp1 = _AVX512_MUL(h2_imag, x1);

        y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, x2);

        y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, x3);

        y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, x4);

        y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        /* Accumulate rows 2..nb-1 into x (with hh[i-1]) and y (with hh[ldh+i]). */
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

                tmp1 = _AVX512_MUL(h1_imag, q1);

                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);

                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);

                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);

                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

                tmp1 = _AVX512_MUL(h2_imag, q1);

                y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h2_imag, q2);

                y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h2_imag, q3);

                y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h2_imag, q4);

                y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
        }

        /* Row nb contributes to x only, with hh[nb-1]. */
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

        tmp1 = _AVX512_MUL(h1_imag, q1);

        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, q2);

        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h1_imag, q3);

        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h1_imag, q4);

        x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        /* Scale x by -hh[0]. */
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
        tmp1 = _AVX512_MUL(h1_imag, x1);

        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);

        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);

        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        /* Both h1 and h2 are loaded with hh[ldh] and then negated. */
        h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);

#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
        h2_real = _AVX512_XOR(h2_real, sign);
        h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif

        /* Broadcast s as (real, imag) pairs across the register.  Both
         * precisions build it element-wise; the single-precision path used to
         * read the float pair through a double pointer, which violates strict
         * aliasing and may be a misaligned 8-byte read. */
#ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0]);
#endif
        /* s <- h2 * s; only the first (real, imag) pair is written back. */
        tmp1 = _AVX512_MUL(h2_imag, tmp2);

        tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);

        /* From here on h2 holds the updated s. */
        h2_real = _AVX512_SET1(s_dbl[0]);
        h2_imag = _AVX512_SET1(s_dbl[1]);

        /* y <- h1 * y */
        tmp1 = _AVX512_MUL(h1_imag, y1);

        y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, y2);

        y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, y3);

        y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, y4);

        y4 = _AVX512_FMADDSUB(h1_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        /* y <- y + h2 * x */
        tmp1 = _AVX512_MUL(h2_imag, x1);

        y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, x2);

        y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, x3);

        y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, x4);

        y4 = _AVX512_ADD(y4, _AVX512_FMADDSUB(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        /* Row 0: q[0] += y */
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);

        q1 = _AVX512_ADD(q1, y1);
        q2 = _AVX512_ADD(q2, y2);
        q3 = _AVX512_ADD(q3, y3);
        q4 = _AVX512_ADD(q4, y4);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);

        /* Row 1: q[1] += x + hh[ldh+1] * y */
        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(ldq*2)+3*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);

        tmp1 = _AVX512_MUL(h2_imag, y1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, y2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, y3);

        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, y4);

        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
        _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
        _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
        _AVX512_STORE(&q_dbl[(ldq*2)+3*offset], q4);

        /* Rows 2..nb-1: q[i] += hh[i-1] * x + hh[ldh+i] * y */
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

                tmp1 = _AVX512_MUL(h1_imag, x1);

                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);

                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);

                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);

                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

                tmp1 = _AVX512_MUL(h2_imag, y1);

                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h2_imag, y2);

                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h2_imag, y3);

                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h2_imag, y4);

                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }

        /* Row nb: q[nb] += hh[nb-1] * x */
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

        tmp1 = _AVX512_MUL(h1_imag, x1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h1_imag, x3);

        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h1_imag, x4);

        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
}

634

635
#ifdef DOUBLE_PRECISION_COMPLEX
636
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
637
638
#endif
#ifdef SINGLE_PRECISION_COMPLEX
639
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
640
641
642
643
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
644
645
646
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
647
648
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
649
650
651
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
652
#endif
Andreas Marek's avatar
Andreas Marek committed
653
654
655
656
657
658
        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE y1, y2, y3, y4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
659
660
661
662
663

#ifdef DOUBLE_PRECISION_COMPLEX
       __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
664
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
665
666
#endif

Andreas Marek's avatar
Andreas Marek committed
667
668
669
        x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
        x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
        x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
670

Andreas Marek's avatar
Andreas Marek committed
671
672
        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
673

Andreas Marek's avatar
Andreas Marek committed
674
675
676
        y1 = _AVX512_LOAD(&q_dbl[0]);
        y2 = _AVX512_LOAD(&q_dbl[offset]);
        y3 = _AVX512_LOAD(&q_dbl[2*offset]);
677

Andreas Marek's avatar
Andreas Marek committed
678
        tmp1 = _AVX512_MUL(h2_imag, x1);
679

Andreas Marek's avatar
Andreas Marek committed
680
        y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
681

Andreas Marek's avatar
Andreas Marek committed
682
        tmp2 = _AVX512_MUL(h2_imag, x2);
683

Andreas Marek's avatar
Andreas Marek committed
684
        y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
685

Andreas Marek's avatar
Andreas Marek committed
686
        tmp3 = _AVX512_MUL(h2_imag, x3);
687

Andreas Marek's avatar
Andreas Marek committed
688
        y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
689

Andreas Marek's avatar
Andreas Marek committed
690
691
692
693
694
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
695

Andreas Marek's avatar
Andreas Marek committed
696
697
                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
698

Andreas Marek's avatar
Andreas Marek committed
699
                tmp1 = _AVX512_MUL(h1_imag, q1);
700

Andreas Marek's avatar
Andreas Marek committed
701
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
702

Andreas Marek's avatar
Andreas Marek committed
703
                tmp2 = _AVX512_MUL(h1_imag, q2);
704

Andreas Marek's avatar
Andreas Marek committed
705
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
706

Andreas Marek's avatar
Andreas Marek committed
707
                tmp3 = _AVX512_MUL(h1_imag, q3);
708

Andreas Marek's avatar
Andreas Marek committed
709
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
710

Andreas Marek's avatar
Andreas Marek committed
711
712
                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
713

Andreas Marek's avatar
Andreas Marek committed
714
                tmp1 = _AVX512_MUL(h2_imag, q1);
715

Andreas Marek's avatar
Andreas Marek committed
716
                y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
717

Andreas Marek's avatar
Andreas Marek committed
718
                tmp2 = _AVX512_MUL(h2_imag, q2);
719

Andreas Marek's avatar
Andreas Marek committed
720
                y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
721

Andreas Marek's avatar
Andreas Marek committed
722
                tmp3 = _AVX512_MUL(h2_imag, q3);
723

Andreas Marek's avatar
Andreas Marek committed
724
                y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
725

Andreas Marek's avatar
Andreas Marek committed
726
        }
727

Andreas Marek's avatar
Andreas Marek committed
728
729
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
730

Andreas Marek's avatar
Andreas Marek committed
731
732
733
        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
734

Andreas Marek's avatar
Andreas Marek committed
735
        tmp1 = _AVX512_MUL(h1_imag, q1);
736

Andreas Marek's avatar
Andreas Marek committed
737
        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
738

Andreas Marek's avatar
Andreas Marek committed
739
        tmp2 = _AVX512_MUL(h1_imag, q2);
740

Andreas Marek's avatar
Andreas Marek committed
741
        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
742

Andreas Marek's avatar
Andreas Marek committed
743
        tmp3 = _AVX512_MUL(h1_imag, q3);
744

Andreas Marek's avatar
Andreas Marek committed
745
        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
746

Andreas Marek's avatar
Andreas Marek committed
747
748
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
749

Andreas Marek's avatar
Andreas Marek committed
750
#ifdef HAVE_AVX512_XEON_PHI
751
752
753
754
755
756
757
758
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
759
760
761
762
763
764
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
765
766
767
#endif
	tmp1 = _AVX512_MUL(h1_imag, x1);

Andreas Marek's avatar
Andreas Marek committed
768
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
769

Andreas Marek's avatar
Andreas Marek committed
770
        tmp2 = _AVX512_MUL(h1_imag, x2);
771

Andreas Marek's avatar
Andreas Marek committed
772
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
773

Andreas Marek's avatar
Andreas Marek committed
774
        tmp3 = _AVX512_MUL(h1_imag, x3);
775

Andreas Marek's avatar
Andreas Marek committed
776
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
777

Andreas Marek's avatar
Andreas Marek committed
778
779
780
781
        h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
782

Andreas Marek's avatar
Andreas Marek committed
783
#ifdef HAVE_AVX512_XEON_PHI
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
801
802
803
804
805
806
807
808
809
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
        h2_real = _AVX512_XOR(h2_real, sign);
        h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
810
811

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
812
813
814
815
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0]);
816
817
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
818
        tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
819
#endif
Andreas Marek's avatar
Andreas Marek committed
820
        tmp1 = _AVX512_MUL(h2_imag, tmp2);
821

Andreas Marek's avatar
Andreas Marek committed
822
        tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
823
824
825

        _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);

Andreas Marek's avatar
Andreas Marek committed
826
827
        h2_real = _AVX512_SET1(s_dbl[0]);
        h2_imag = _AVX512_SET1(s_dbl[1]);
828

Andreas Marek's avatar
Andreas Marek committed
829
        tmp1 = _AVX512_MUL(h1_imag, y1);
830

Andreas Marek's avatar
Andreas Marek committed
831
        y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
832

Andreas Marek's avatar
Andreas Marek committed
833
        tmp2 = _AVX512_MUL(h1_imag, y2);
834

Andreas Marek's avatar
Andreas Marek committed
835
        y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
836

Andreas Marek's avatar
Andreas Marek committed
837
        tmp3 = _AVX512_MUL(h1_imag, y3);
838

Andreas Marek's avatar
Andreas Marek committed
839
        y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
840

Andreas Marek's avatar
Andreas Marek committed
841
        tmp1 = _AVX512_MUL(h2_imag, x1);
842

Andreas Marek's avatar
Andreas Marek committed
843
        y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
844

Andreas Marek's avatar
Andreas Marek committed
845
        tmp2 = _AVX512_MUL(h2_imag, x2);
846

Andreas Marek's avatar
Andreas Marek committed
847
        y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
848

Andreas Marek's avatar
Andreas Marek committed
849
        tmp3 = _AVX512_MUL(h2_imag, x3);
850

Andreas Marek's avatar
Andreas Marek committed
851
        y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
852

Andreas Marek's avatar
Andreas Marek committed
853
854
855
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
856

Andreas Marek's avatar
Andreas Marek committed
857
858
859
        q1 = _AVX512_ADD(q1, y1);
        q2 = _AVX512_ADD(q2, y2);
        q3 = _AVX512_ADD(q3, y3);
860

Andreas Marek's avatar
Andreas Marek committed
861
862
863
        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
864

Andreas Marek's avatar
Andreas Marek committed
865
866
        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
867

Andreas Marek's avatar
Andreas Marek committed
868
869
870
        q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
871

Andreas Marek's avatar
Andreas Marek committed
872
873
874
        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
875

Andreas Marek's avatar
Andreas Marek committed
876
        tmp1 = _AVX512_MUL(h2_imag, y1);
877

Andreas Marek's avatar
Andreas Marek committed
878
        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
879

Andreas Marek's avatar
Andreas Marek committed
880
        tmp2 = _AVX512_MUL(h2_imag, y2);
881

Andreas Marek's avatar
Andreas Marek committed
882
        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
883

Andreas Marek's avatar
Andreas Marek committed
884
        tmp3 = _AVX512_MUL(h2_imag, y3);
885

Andreas Marek's avatar
Andreas Marek committed
886
        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
887

Andreas Marek's avatar
Andreas Marek committed
888
889
890
        _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
        _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
        _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
891

Andreas Marek's avatar
Andreas Marek committed
892
893
894
895
896
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);