// complex_avx512_1hv_template.c
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------


#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
53
#include <stdio.h>
Andreas Marek's avatar
Andreas Marek committed
54
#include <stdlib.h>
55 56 57 58 59 60 61 62 63 64 65

/* Maps the `__forceinline` keyword used on the static kernels in this file
 * onto the GCC-style `always_inline` function attribute. */
#define __forceinline __attribute__((always_inline))

/*
 * Double-precision instantiation of the template: every generic _AVX512_*
 * name used by the kernels is aliased to the corresponding packed-double
 * (__m512d / *_pd) AVX-512 intrinsic.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
#define __AVX512_DATATYPE __m512d
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
/* The floating-point XOR alias is only provided for the HAVE_AVX512_XEON
 * configuration; the integer XOR alias below is always provided. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
/* Shuffle immediate: with both shuffle operands equal, 0x55 swaps the two
 * doubles of every pair, i.e. the real and imaginary part of each complex. */
#define _SHUFFLE 0x55

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)

#endif

/* These aliases resolve only when HAVE_AVX512 is defined (see above). */
#define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
#define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */

/*
 * Single-precision instantiation of the template: every generic _AVX512_*
 * name used by the kernels is aliased to the corresponding packed-float
 * (__m512 / *_ps) AVX-512 intrinsic.
 */
#ifdef SINGLE_PRECISION_COMPLEX
#define __AVX512_DATATYPE __m512
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
/* The floating-point XOR alias is only provided for the HAVE_AVX512_XEON
 * configuration; the integer XOR alias below is always provided. */
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
/* Shuffle immediate: with both shuffle operands equal, 0xb1 swaps adjacent
 * floats, i.e. the real and imaginary part of each complex. */
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)

#endif

/* These aliases resolve only when HAVE_AVX512 is defined (see above). */
#define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
#define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */


// Forward declarations of the fixed-width kernels called by the driver below.
// The number in each name is the count of complex elements per row that the
// kernel processes (q, hh: data pointers; nb: number of rows / length of hh;
// ldq: row stride of q in complex elements).
#ifdef DOUBLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
#endif

#ifdef SINGLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
#endif


/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                 :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_float_complex)     :: q(*)
!f>     type(c_ptr), value                  :: q
!f>     complex(kind=c_float_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
 * Driver: applies one complex Householder vector hh to the nb rows of q by
 * sweeping across the row width in blocks and dispatching each block to a
 * fixed-width AVX-512 kernel.
 *
 *   q    : matrix data, updated in place by the kernels
 *   hh   : Householder vector, passed through to the kernels
 *   pnb  : pointer to the number of rows (length of hh used by the kernels)
 *   pnq  : not read by this function
 *   pldq : pointer to the row stride of q in complex elements; also used as
 *          the width to process
 *
 * Full blocks are 24 (double) / 48 (single) complex elements wide. The tail
 * is handled by exactly one smaller kernel when it is 20/16/12/8/4 (double)
 * or 40/32/24/16/8 (single) elements wide; any other tail width is left
 * unprocessed, which WITH_DEBUG builds report and abort on.
 *
 * NOTE(review): the width nq is initialised from *pldq, not *pnq. This looks
 * intentional (only multiples of the vector width are handled), but confirm
 * against the Fortran caller.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
#endif
{
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int worked_on = 0;

        (void)pnq;      // part of the Fortran-visible interface, unused here

        // Main sweep over full-width blocks. The loop bound is chosen so that
        // a tail wider than the largest remainder kernel is still handled by
        // the full-width kernel.
#ifdef DOUBLE_PRECISION_COMPLEX
        for (i = 0; i < nq-20; i+=24)
        {
                hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 24;
        }
#endif

#ifdef SINGLE_PRECISION_COMPLEX
        for (i = 0; i < nq-40; i+=48)
        {
                hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 48;
        }
#endif

        // Nothing left over: done.
        if (nq == i)
        {
                return;
        }

        // Tail: i is unchanged from here on, so at most one case applies.
        switch (nq - i)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
        case 20:
                hh_trafo_complex_kernel_20_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 20;
                break;
        case 16:
                hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 16;
                break;
        case 12:
                hh_trafo_complex_kernel_12_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 12;
                break;
        case 8:
                hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 8;
                break;
        case 4:
                hh_trafo_complex_kernel_4_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 4;
                break;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        case 40:
                hh_trafo_complex_kernel_40_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 40;
                break;
        case 32:
                hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 32;
                break;
        case 24:
                hh_trafo_complex_kernel_24_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 24;
                break;
        case 16:
                hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 16;
                break;
        case 8:
                hh_trafo_complex_kernel_8_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 8;
                break;
#endif
        default:
                // Unsupported tail width: nothing is processed here.
                break;
        }

#ifdef WITH_DEBUG
        // Sanity check: the kernels together must have covered the full width.
        if (worked_on != nq)
        {
             printf("Error in complex AVX512 BLOCK 1 kernel \n");
             abort();
        }
#endif
}

/*
 * Widest kernel: processes six 512-bit vectors per row of q, i.e. 24 double
 * complex or 48 single complex elements, over nb rows.
 *
 * With q(i) denoting the six vectors starting at q[i*ldq], it computes
 *
 *   x     = q(0) + sum_{i=1}^{nb-1} conj(hh[i]) * q(i)
 *   x     = -hh[0] * x
 *   q(0) += x
 *   q(i) += hh[i] * x          for i = 1 .. nb-1
 *
 * Parameters:
 *   q   : first element of the block to update (modified in place)
 *   hh  : Householder vector, nb complex entries are read
 *   nb  : number of rows of q / entries of hh
 *   ldq : distance between consecutive rows of q, in complex elements
 *
 * All accesses use the aligned load/store intrinsics, so every vector address
 * touched in q must satisfy their 64-byte alignment requirement.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;
        __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
        int i=0;

        // Mask with only the sign bit of every lane set; XOR-ing with it negates.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Scalars per 512-bit vector. The macro stays defined after this
        // function and is used by later code in this file.
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#endif

        // x = q(0): each vector holds 4 double complex / 8 single complex values.
        x1 = _AVX512_LOAD(&q_dbl[0]);
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);
        x5 = _AVX512_LOAD(&q_dbl[4*offset]);
        x6 = _AVX512_LOAD(&q_dbl[5*offset]);

        // x += conj(hh[i]) * q(i) for the remaining rows.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                // Complex multiply: imag-part products are swapped into place
                // by the shuffle and combined by the alternating FMA.
                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, q5);
                x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, q6);
                x6 = _AVX512_ADD(x6, _AVX512_FMSUBADD(h1_real, q6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
        }

        // Broadcast hh[0] and negate it by flipping the sign bits.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        // Exactly one variant must run: applying both would flip the sign
        // twice and leave hh[0] un-negated.
#if defined(HAVE_AVX512_XEON_PHI)
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#elif defined(HAVE_AVX512_XEON)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif

        // x = (-hh[0]) * x
        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp5 = _AVX512_MUL(h1_imag, x5);
        x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));

        tmp6 = _AVX512_MUL(h1_imag, x6);
        x6 = _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE));

        // q(0) += x
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);
        q5 = _AVX512_LOAD(&q_dbl[4*offset]);
        q6 = _AVX512_LOAD(&q_dbl[5*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);
        q5 = _AVX512_ADD(q5, x5);
        q6 = _AVX512_ADD(q6, x6);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);
        _AVX512_STORE(&q_dbl[4*offset], q5);
        _AVX512_STORE(&q_dbl[5*offset], q6);

        // q(i) += hh[i] * x for the remaining rows.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, x5);
                q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, x6);
                q6 = _AVX512_ADD(q6, _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
        }
}

475 476 477 478 479 480 481 482 483
/*
 * Remainder kernel: processes five 512-bit vectors per row of q, i.e. 20
 * double complex or 40 single complex elements, over nb rows.
 *
 * With q(i) denoting the five vectors starting at q[i*ldq], it computes
 *
 *   x     = q(0) + sum_{i=1}^{nb-1} conj(hh[i]) * q(i)
 *   x     = -hh[0] * x
 *   q(0) += x
 *   q(i) += hh[i] * x          for i = 1 .. nb-1
 *
 * Parameters:
 *   q   : first element of the block to update (modified in place)
 *   hh  : Householder vector, nb complex entries are read
 *   nb  : number of rows of q / entries of hh
 *   ldq : distance between consecutive rows of q, in complex elements
 *
 * All accesses use the aligned load/store intrinsics, so every vector address
 * touched in q must satisfy their 64-byte alignment requirement.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX512_DATATYPE x1, x2, x3, x4, x5;
        __AVX512_DATATYPE q1, q2, q3, q4, q5;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5;
        int i=0;

        // Mask with only the sign bit of every lane set; XOR-ing with it negates.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Scalars per 512-bit vector. The macro stays defined after this
        // function and is used by later code in this file.
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#endif

        // x = q(0): each vector holds 4 double complex / 8 single complex values.
        x1 = _AVX512_LOAD(&q_dbl[0]);
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);
        x5 = _AVX512_LOAD(&q_dbl[4*offset]);

        // x += conj(hh[i]) * q(i) for the remaining rows.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

                // Complex multiply: imag-part products are swapped into place
                // by the shuffle and combined by the alternating FMA.
                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, q5);
                x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
        }

        // Broadcast hh[0] and negate it by flipping the sign bits.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        // Exactly one variant must run: applying both would flip the sign
        // twice and leave hh[0] un-negated.
#if defined(HAVE_AVX512_XEON_PHI)
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#elif defined(HAVE_AVX512_XEON)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif

        // x = (-hh[0]) * x
        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp5 = _AVX512_MUL(h1_imag, x5);
        x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));

        // q(0) += x
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);
        q5 = _AVX512_LOAD(&q_dbl[4*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);
        q5 = _AVX512_ADD(q5, x5);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);
        _AVX512_STORE(&q_dbl[4*offset], q5);

        // q(i) += hh[i] * x for the remaining rows.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, x5);
                q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
        }
}


649 650 651 652 653 654 655 656 657
// Applies one reflector, described by the nb complex coefficients in hh, to a
// strip of q that is four AVX-512 registers wide (16 complex doubles or
// 32 complex floats, as reflected in the two kernel names).
//
// Parameters:
//   q   - start of the strip; updated in place. Row i of the strip starts at
//         complex index i*ldq (scalar index 2*i*ldq in the code below).
//   hh  - reflector coefficients; hh[0] is used as the scaling factor,
//         hh[1..nb-1] are the per-row coefficients.
//   nb  - number of rows of q that are read and written.
//   ldq - distance between consecutive rows of q, in complex elements.
//
// Steps:
//   1. x  = q(row 0) + sum_{i=1}^{nb-1} FMSUBADD(hh[i], q(row i))
//   2. x  = FMADDSUB(-hh[0], x)
//   3. q(row 0) += x;  q(row i) += FMADDSUB(hh[i], x)   for i = 1..nb-1
//
// NOTE(review): `offset`, `_SHUFFLE` and the _AVX512_* wrappers are defined
// elsewhere in this template. The MUL / SHUFFLE / FMSUBADD sequence presumably
// evaluates conj(h)*q and the FMADDSUB sequence h*x on interleaved (re, im)
// lanes -- confirm against the macro definitions. Alignment requirements on q
// depend on what _AVX512_LOAD / _AVX512_STORE expand to.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as flat (re, im, re, im, ...) scalar arrays.
        // The "_dbl" suffix is kept for both precisions so that the body below
        // is shared between the double and the single instantiation.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif

        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;

        // Mask with only the sign bit of every lane set; XOR-ing with it
        // negates each lane.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Step 1: start the accumulators from row 0 of the strip.
        x1 = _AVX512_LOAD(&q_dbl[0]);         // first register (complex 1..4 in double precision)
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);  // last register (complex 13..16 in double precision)

        for (i = 1; i < nb; i++)
        {
                // Broadcast real and imaginary part of hh[i] to all lanes.
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                // x += FMSUBADD(h_real, q, shuffle(h_imag * q))
                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
        }

        // Step 2: scale the accumulators by the negated hh[0].
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        // Flip the sign of both parts. The Xeon Phi path goes through the
        // integer XOR wrapper and therefore needs the casts to __m512i; the
        // Xeon path XORs the floating-point registers directly.
#ifdef HAVE_AVX512_XEON_PHI
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
        h1_real = _AVX512_XOR(h1_real, sign);
        h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif

        // x = FMADDSUB(h_real, x, shuffle(h_imag * x))
        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        // Step 3a: row 0 receives the scaled accumulators unchanged.
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);

        // Step 3b: every further row i receives FMADDSUB(hh[i], x).
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }
}

795 796 797 798 799 800 801 802 803
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
804 805
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
806 807
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
808 809
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
810 811
#endif

Andreas Marek's avatar
Andreas Marek committed
812 813 814 815 816
        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
817 818

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
819
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
820 821
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
822
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
823 824
#endif

Andreas Marek's avatar
Andreas Marek committed
825 826 827
        x1 = _AVX512_LOAD(&q_dbl[0]);   // complex 1 2 3 4
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
828

Andreas Marek's avatar
Andreas Marek committed
829 830 831 832
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
833

Andreas Marek's avatar
Andreas Marek committed
834 835 836
                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
837

Andreas Marek's avatar
Andreas Marek committed
838
                tmp1 = _AVX512_MUL(h1_imag, q1);
839

Andreas Marek's avatar
Andreas Marek committed
840
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
841

Andreas Marek's avatar
Andreas Marek committed
842
                tmp2 = _AVX512_MUL(h1_imag, q2);
843