//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Force inlining of the static kernels below.  GCC documents always_inline
 * for functions that are also declared inline and warns otherwise, so the
 * inline keyword is part of the macro.
 */
#define __forceinline inline __attribute__((always_inline))

/*
 * Precision-dependent macro layer.
 *
 * Exactly one of DOUBLE_PRECISION_COMPLEX / SINGLE_PRECISION_COMPLEX is
 * expected to be defined by whoever compiles this template.  The macros map
 * the generic _AVX512_* names used by the kernels onto the _pd / _ps
 * intrinsics.
 *
 *   offset    number of scalar (real) elements held by one 512-bit register
 *   _SHUFFLE  immediate for _AVX512_SHUFFLE that swaps the two neighbouring
 *             scalars of every (real, imag) pair
 */
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8

#define __AVX512_DATATYPE __m512d
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_SET _mm512_set_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
/* The floating-point XOR is only mapped when HAVE_AVX512_XEON is defined. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
#endif

#define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
#define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16

#define __AVX512_DATATYPE __m512
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_SET _mm512_set_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
/* The floating-point XOR is only mapped when HAVE_AVX512_XEON is defined. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
#endif

#define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
#define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */

// Forward declarations of the strip kernels.  The number in each name is the
// count of consecutive complex entries of q that one call processes; the
// driver below picks the kernel matching the entries still to be handled.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif

#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
#endif

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx512_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)     :: q(*)
!f>     type(c_ptr), value                  :: q
!f>     complex(kind=c_float_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
 * Driver for the two-Householder-vector complex AVX-512 kernels.
 *
 * Parameters (all scalars are passed by pointer, Fortran style):
 *   q     matrix data that is updated in place by the strip kernels
 *   hh    the two Householder vectors; the second one starts at hh[ldh]
 *   pnb   length of the Householder vectors (nb)
 *   pnq   not read by this function (see the review note on nq below)
 *   pldq  leading dimension of q
 *   pldh  leading dimension of hh
 *
 * The function first forms the scalar
 *     s = conj(hh[ldh+1]) + sum_{i=2}^{nb-1} hh[i-1] * conj(hh[ldh+i])
 * and then walks over q in strips of consecutive complex entries, handing
 * each strip to the kernel of matching width.
 *
 * With WITH_DEBUG defined, the function aborts if the number of processed
 * entries differs from nq.  The early return taken when the main loop
 * consumed everything exactly bypasses that check.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx512_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
        int i;
        int nb = *pnb;
        // NOTE(review): nq is taken from *pldq rather than *pnq, so the whole
        // leading dimension is processed.  This looks deliberate (the strip
        // widths below only cover multiples of 4 resp. 8) - confirm with callers.
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;

        // Counts the entries handed to kernels; only inspected under WITH_DEBUG.
        worked_on = 0;

        // Scalar coupling term of the two Householder vectors.
#ifdef DOUBLE_PRECISION_COMPLEX
        double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float complex s = conj(hh[(ldh)+1])*1.0f;
#endif
        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * conj(hh[(i+ldh)]);
        }

        // Main loop: widest kernel, as long as more than 12 (double) resp.
        // 24 (single) entries remain.
#ifdef DOUBLE_PRECISION_COMPLEX
        for (i = 0; i < nq-12; i+=16)
        {
                hh_trafo_complex_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        for (i = 0; i < nq-24; i+=32)
        {
                hh_trafo_complex_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 32;

        }
#endif
        // Nothing left over: done.
        if (nq-i == 0) {
                return;
        }

        // Remainder handling: at most one of the following branches matches.
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 12 )
        {
                hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 12;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 24 )
        {
                hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 24;
        }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 8 )
        {
                hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 16 )
        {
                hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 4 ) {

                hh_trafo_complex_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 4;
        }
#endif

#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 8 ) {

                hh_trafo_complex_kernel_8_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
#endif
#ifdef WITH_DEBUG
        // A remainder that matched none of the branches above is a logic error.
        if (worked_on != nq)
        {
             printf("Error in complex AVX512 BLOCK 2 kernel \n");
             abort();
        }
#endif
}

/*
 * Widest strip kernel: four 512-bit registers per row of q, i.e. 16 complex
 * doubles or 32 complex floats.
 *
 * Parameters:
 *   q    first entry of the strip; row i of the strip starts at q[i*ldq]
 *   hh   Householder vectors; the second one starts at hh[ldh]
 *   nb   length of the Householder vectors; rows 0..nb of q are touched
 *   ldq  leading dimension of q (in complex elements)
 *   ldh  leading dimension of hh (in complex elements)
 *   s    coupling scalar computed by the driver (local copy, modified here)
 *
 * Complex products are formed from a real multiply, a real/imag swap
 * (_AVX512_SHUFFLE with _SHUFFLE) and a fused multiply with alternating sign:
 *   _AVX512_FMSUBADD(h_real, v, swap(h_imag*v))  ==  conj(h) * v
 *   _AVX512_FMADDSUB(h_real, v, swap(h_imag*v))  ==  h * v
 *
 * All accesses to q use the aligned load/store intrinsics, so every touched
 * address must satisfy their alignment requirement.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{

        // Scalar views of the complex arrays: element k has its real part at
        // index 2*k and its imaginary part at index 2*k+1.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
#endif
        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE y1, y2, y3, y4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;

        // Sign-bit mask; XOR-ing with it negates every lane.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // ---- Phase 1: accumulate x and y ---------------------------------
        // x starts as row 1 of q, y as row 0 + conj(hh[ldh+1]) * row 1.
        x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
        x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
        x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
        x4 = _AVX512_LOAD(&q_dbl[(2*ldq)+3*offset]); // q13, q14, q15, q16

        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

        y1 = _AVX512_LOAD(&q_dbl[0]);
        y2 = _AVX512_LOAD(&q_dbl[offset]);
        y3 = _AVX512_LOAD(&q_dbl[2*offset]);
        y4 = _AVX512_LOAD(&q_dbl[3*offset]);

        tmp1 = _AVX512_MUL(h2_imag, x1);

        y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, x2);

        y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, x3);

        y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, x4);

        y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        // Rows 2..nb-1: x += conj(hh[i-1]) * row i, y += conj(hh[ldh+i]) * row i.
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

                tmp1 = _AVX512_MUL(h1_imag, q1);

                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);

                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);

                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);

                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

                tmp1 = _AVX512_MUL(h2_imag, q1);

                y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h2_imag, q2);

                y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h2_imag, q3);

                y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h2_imag, q4);

                y4 = _AVX512_ADD(y4, _AVX512_FMSUBADD(h2_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
        }

        // Row nb contributes to x only: x += conj(hh[nb-1]) * row nb.
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

        tmp1 = _AVX512_MUL(h1_imag, q1);

        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, q2);

        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h1_imag, q3);

        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h1_imag, q4);

        x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        // ---- Phase 2: scale x and y ---------------------------------------
        // x = (-hh[0]) * x.  The negation is done by flipping the sign bits of
        // the broadcast real and imaginary parts.
        // NOTE(review): the negation only happens when HAVE_AVX512_XEON_PHI or
        // HAVE_AVX512_XEON is defined - confirm the build always sets one.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
        tmp1 = _AVX512_MUL(h1_imag, x1);

        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);

        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);

        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        // Both h1 and h2 are loaded with hh[ldh] and negated: h1 scales y,
        // h2 scales s.
        h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);

#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
        h2_real = _AVX512_XOR(h2_real, sign);
        h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif

        // Broadcast s as (real, imag) pairs across the register.  The single
        // precision path spells out the pairs instead of reading the float
        // pair through a double pointer, which violated strict aliasing; the
        // resulting lane pattern is the same.
#ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0]);
#endif
        // s = h2 * s; only the two lowest lanes are written back to s.
        tmp1 = _AVX512_MUL(h2_imag, tmp2);

        tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        _AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);

        h2_real = _AVX512_SET1(s_dbl[0]);
        h2_imag = _AVX512_SET1(s_dbl[1]);

        // y = h1 * y + s * x
        tmp1 = _AVX512_MUL(h1_imag, y1);

        y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, y2);

        y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, y3);

        y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, y4);

        y4 = _AVX512_FMADDSUB(h1_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp1 = _AVX512_MUL(h2_imag, x1);

        y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, x2);

        y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, x3);

        y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, x4);

        y4 = _AVX512_ADD(y4, _AVX512_FMADDSUB(h2_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        // ---- Phase 3: update q in place -----------------------------------
        // Row 0: q += y
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);

        q1 = _AVX512_ADD(q1, y1);
        q2 = _AVX512_ADD(q2, y2);
        q3 = _AVX512_ADD(q3, y3);
        q4 = _AVX512_ADD(q4, y4);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);

        // Row 1: q += x + hh[ldh+1] * y
        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(ldq*2)+3*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);

        tmp1 = _AVX512_MUL(h2_imag, y1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h2_imag, y2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h2_imag, y3);

        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h2_imag, y4);

        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        _AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
        _AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
        _AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
        _AVX512_STORE(&q_dbl[(ldq*2)+3*offset], q4);

        // Rows 2..nb-1: q += hh[i-1] * x + hh[ldh+i] * y
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);

                tmp1 = _AVX512_MUL(h1_imag, x1);

                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);

                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);

                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);

                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);

                tmp1 = _AVX512_MUL(h2_imag, y1);

                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h2_imag, y2);

                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h2_imag, y3);

                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h2_imag, y4);

                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h2_real, y4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }

        // Row nb: q += hh[nb-1] * x
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);

        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

        tmp1 = _AVX512_MUL(h1_imag, x1);

        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

        tmp2 = _AVX512_MUL(h1_imag, x2);

        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

        tmp3 = _AVX512_MUL(h1_imag, x3);

        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

        tmp4 = _AVX512_MUL(h1_imag, x4);

        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

        _AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
        _AVX512_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
}

634

635
#ifdef DOUBLE_PRECISION_COMPLEX
636
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
637 638
#endif
#ifdef SINGLE_PRECISION_COMPLEX
639
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
640 641 642 643
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
644 645 646
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
647 648
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
649 650 651
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
652
#endif
Andreas Marek's avatar
Andreas Marek committed
653 654 655 656 657 658
        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE y1, y2, y3, y4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
659 660 661 662 663

#ifdef DOUBLE_PRECISION_COMPLEX
       __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
664
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
665 666
#endif

Andreas Marek's avatar
Andreas Marek committed
667 668 669
        x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);  // q1, q2, q3, q4
        x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);  // q5, q6, q7, q8
        x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
670

Andreas Marek's avatar
Andreas Marek committed
671 672
        h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
673

Andreas Marek's avatar
Andreas Marek committed
674 675 676
        y1 = _AVX512_LOAD(&q_dbl[0]);
        y2 = _AVX512_LOAD(&q_dbl[offset]);
        y3 = _AVX512_LOAD(&q_dbl[2*offset]);
677

Andreas Marek's avatar
Andreas Marek committed
678
        tmp1 = _AVX512_MUL(h2_imag, x1);
679

Andreas Marek's avatar
Andreas Marek committed
680
        y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
681

Andreas Marek's avatar
Andreas Marek committed
682
        tmp2 = _AVX512_MUL(h2_imag, x2);
683

Andreas Marek's avatar
Andreas Marek committed
684
        y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
685

Andreas Marek's avatar
Andreas Marek committed
686
        tmp3 = _AVX512_MUL(h2_imag, x3);
687

Andreas Marek's avatar
Andreas Marek committed
688
        y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
689

Andreas Marek's avatar
Andreas Marek committed
690 691 692 693 694
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
695

Andreas Marek's avatar
Andreas Marek committed
696 697
                h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
                h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
698

Andreas Marek's avatar
Andreas Marek committed
699
                tmp1 = _AVX512_MUL(h1_imag, q1);
700

Andreas Marek's avatar
Andreas Marek committed
701
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
702

Andreas Marek's avatar
Andreas Marek committed
703
                tmp2 = _AVX512_MUL(h1_imag, q2);
704

Andreas Marek's avatar
Andreas Marek committed
705
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
706

Andreas Marek's avatar
Andreas Marek committed
707
                tmp3 = _AVX512_MUL(h1_imag, q3);
708

Andreas Marek's avatar
Andreas Marek committed
709
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
710

Andreas Marek's avatar
Andreas Marek committed
711 712
                h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
713

Andreas Marek's avatar
Andreas Marek committed
714
                tmp1 = _AVX512_MUL(h2_imag, q1);
715

Andreas Marek's avatar
Andreas Marek committed
716
                y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
717

Andreas Marek's avatar
Andreas Marek committed
718
                tmp2 = _AVX512_MUL(h2_imag, q2);
719

Andreas Marek's avatar
Andreas Marek committed
720
                y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
721

Andreas Marek's avatar
Andreas Marek committed
722
                tmp3 = _AVX512_MUL(h2_imag, q3);
723

Andreas Marek's avatar
Andreas Marek committed
724
                y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
725

Andreas Marek's avatar
Andreas Marek committed
726
        }
727

Andreas Marek's avatar
Andreas Marek committed
728 729
        h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
        h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
730

Andreas Marek's avatar
Andreas Marek committed
731 732 733
        q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
734

Andreas Marek's avatar
Andreas Marek committed
735
        tmp1 = _AVX512_MUL(h1_imag, q1);
736

Andreas Marek's avatar
Andreas Marek committed
737
        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
738

Andreas Marek's avatar
Andreas Marek committed
739
        tmp2 = _AVX512_MUL(h1_imag, q2);
740

Andreas Marek's avatar
Andreas Marek committed
741
        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
742

Andreas Marek's avatar
Andreas Marek committed
743
        tmp3 = _AVX512_MUL(h1_imag, q3);
744

Andreas Marek's avatar
Andreas Marek committed
745
        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
746

Andreas Marek's avatar
Andreas Marek committed
747 748
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
749

Andreas Marek's avatar
Andreas Marek committed
750
#ifdef HAVE_AVX512_XEON_PHI
751 752 753 754 755 756 757 758
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
759 760 761 762 763 764
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
765 766 767
#endif
	tmp1 = _AVX512_MUL(h1_imag, x1);

Andreas Marek's avatar
Andreas Marek committed
768
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
769

Andreas Marek's avatar
Andreas Marek committed
770
        tmp2 = _AVX512_MUL(h1_imag, x2);
771

Andreas Marek's avatar
Andreas Marek committed
772
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(