// complex_avx512_1hv_template.c
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------


#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

// Spell __forceinline (MSVC/ICC style) as the GCC always_inline attribute.
#define __forceinline __attribute__((always_inline))

// Precision-generic aliases: the kernels below are written once against the
// _AVX512_* names and this file is compiled with exactly one of
// DOUBLE_PRECISION_COMPLEX / SINGLE_PRECISION_COMPLEX defined.
#ifdef DOUBLE_PRECISION_COMPLEX
#define __AVX512_DATATYPE __m512d
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
// NOTE(review): _mm512_xor_pd is an AVX512DQ intrinsic; it is not used in the
// visible part of this file (the kernels use _AVX512_XOR_EPI instead).
#define _AVX512_XOR _mm512_xor_pd
#define _AVX512_XOR_EPI _mm512_xor_epi64
// With both shuffle operands equal, 0x55 swaps the two doubles of every
// 128-bit lane, i.e. exchanges the real and imaginary part of each complex.
#define _SHUFFLE 0x55

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
// fmaddsub: a*b - c in even elements, a*b + c in odd elements.
// fmsubadd: a*b + c in even elements, a*b - c in odd elements.
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)

#endif

// These resolve only when HAVE_AVX512 is defined; there is no non-FMA
// fallback in this file.
#define _AVX512_FMADDSUB _mm512_FMADDSUB_pd
#define _AVX512_FMSUBADD _mm512_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
#define __AVX512_DATATYPE __m512
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
// NOTE(review): _mm512_xor_ps is an AVX512DQ intrinsic; it is not used in the
// visible part of this file (the kernels use _AVX512_XOR_EPI instead).
#define _AVX512_XOR _mm512_xor_ps
#define _AVX512_XOR_EPI _mm512_xor_epi32
// With both shuffle operands equal, 0xb1 swaps adjacent floats (0<->1, 2<->3)
// in every 128-bit lane, i.e. exchanges real and imaginary parts.
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
// fmaddsub: a*b - c in even elements, a*b + c in odd elements.
// fmsubadd: a*b + c in even elements, a*b - c in odd elements.
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)

#endif

// These resolve only when HAVE_AVX512 is defined; there is no non-FMA
// fallback in this file.
#define _AVX512_FMADDSUB _mm512_FMADDSUB_ps
#define _AVX512_FMSUBADD _mm512_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */


// Forward declarations of the fixed-width kernels called by the driver below.
// The number in each name is how many complex elements the kernel processes
// per call: a multiple of 4 in double precision and of 8 in single precision
// (one 512-bit register holds 4 double complex or 8 float complex values).
#ifdef DOUBLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
#endif

#ifdef SINGLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
#endif


/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                 :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_float_complex)     :: q(*)
!f>     type(c_ptr), value                  :: q
!f>     complex(kind=c_float_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
 * Driver: applies one complex Householder vector to q by tiling the leading
 * dimension with the fixed-width kernels declared above.
 *
 *   q    - matrix data, updated in place
 *   hh   - Householder vector; only passed through to the kernels here
 *   pnb  - pointer to the Householder vector length (nb)
 *   pnq  - not read by this function (see note on nq below)
 *   pldq - pointer to the leading dimension of q
 *
 * The widest kernel (24 double / 48 single complex elements) is used as long
 * as more than 20 / 40 elements remain; the remainder must then be exactly
 * one of the narrower kernel widths. Any other remainder is left unprocessed
 * and is only detected when WITH_DEBUG is defined.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
#endif
{
        int i;
        int nb = *pnb;
        // NOTE(review): the element count is taken from pldq, not pnq, so the
        // whole leading dimension is processed. Presumably intentional (padded
        // storage) -- confirm against the callers before changing.
        int nq = *pldq;
        int ldq = *pldq;
        int worked_on;

        worked_on = 0;

        // Bulk part: full-width kernel.
#ifdef DOUBLE_PRECISION_COMPLEX
        for (i = 0; i < nq-20; i+=24)
        {
                hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 24;
        }
#endif

#ifdef SINGLE_PRECISION_COMPLEX
        for (i = 0; i < nq-40; i+=48)
        {
                hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 48;
        }
#endif

        // Nothing left over: done (this path skips the WITH_DEBUG check).
        if (nq == i)
        {
                return;
        }

        // Remainder: nq-i is fixed from here on, so at most one branch runs.
#ifdef DOUBLE_PRECISION_COMPLEX
        if (nq-i == 20)
        {
                hh_trafo_complex_kernel_20_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 20;
        }
        else if (nq-i == 16)
        {
                hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 16;
        }
        else if (nq-i == 12)
        {
                hh_trafo_complex_kernel_12_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 12;
        }
        else if (nq-i == 8)
        {
                hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 8;
        }
        else if (nq-i == 4)
        {
                hh_trafo_complex_kernel_4_AVX512_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 4;
        }
#endif

#ifdef SINGLE_PRECISION_COMPLEX
        if (nq-i == 40)
        {
                hh_trafo_complex_kernel_40_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 40;
        }
        else if (nq-i == 32)
        {
                hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 32;
        }
        else if (nq-i == 24)
        {
                hh_trafo_complex_kernel_24_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 24;
        }
        else if (nq-i == 16)
        {
                hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 16;
        }
        else if (nq-i == 8)
        {
                hh_trafo_complex_kernel_8_AVX512_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 8;
        }
#endif

#ifdef WITH_DEBUG
        // Every element of the leading dimension must have been covered.
        if (worked_on != nq)
        {
             printf("Error in complex AVX512 BLOCK 1 kernel \n");
             abort();
        }
#endif
}

/*
 * Widest kernel: six 512-bit registers per step, i.e. 24 double complex or
 * 48 float complex consecutive elements of q.
 *
 * With q_i denoting the six-register slice starting at complex offset i*ldq,
 * the code computes
 *
 *     x    = q_0 + sum_{i=1}^{nb-1} conj(hh[i]) * q_i
 *     x    = -hh[0] * x
 *     q_0 += x
 *     q_i += hh[i] * x              for i = 1 .. nb-1
 *
 * Complex products are built from a broadcast real part, a broadcast
 * imaginary part, a real/imag swap (_AVX512_SHUFFLE with _SHUFFLE) and an
 * alternating FMA: FMSUBADD yields conj(h)*v, FMADDSUB yields h*v.
 *
 * All accesses use aligned loads/stores, so q and every q + i*ldq must be
 * 64-byte aligned (assumed to be guaranteed by the caller -- TODO confirm).
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // Scalar views of the interleaved (re, im) storage. The "_dbl" suffix
        // is kept for both precisions; in the single build these are floats.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX512_DATATYPE x1, x2, x3, x4, x5, x6;
        __AVX512_DATATYPE q1, q2, q3, q4, q5, q6;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
        int i=0;

        // Mask with only the sign bit set in every element; XOR-ing with it
        // negates a floating-point vector.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Scalars per 512-bit register. This is a file-scope macro, not a
        // local: it stays defined after this function and at least one later
        // kernel in this file uses it without defining it.
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#endif


        // x = q_0 (each register: 4 double complex / 8 float complex values)
        x1 = _AVX512_LOAD(&q_dbl[0]);
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);
        x5 = _AVX512_LOAD(&q_dbl[4*offset]);
        x6 = _AVX512_LOAD(&q_dbl[5*offset]);

        // x += conj(hh[i]) * q_i
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, q5);
                x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, q6);
                x6 = _AVX512_ADD(x6, _AVX512_FMSUBADD(h1_real, q6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
        }

        // x = -hh[0] * x : broadcast hh[0], then flip the sign of both parts.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);

        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp5 = _AVX512_MUL(h1_imag, x5);
        x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));

        tmp6 = _AVX512_MUL(h1_imag, x6);
        x6 = _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE));

        // q_0 += x
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);
        q5 = _AVX512_LOAD(&q_dbl[4*offset]);
        q6 = _AVX512_LOAD(&q_dbl[5*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);
        q5 = _AVX512_ADD(q5, x5);
        q6 = _AVX512_ADD(q6, x6);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);
        _AVX512_STORE(&q_dbl[4*offset], q5);
        _AVX512_STORE(&q_dbl[5*offset], q6);

        // q_i += hh[i] * x
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, x5);
                q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                tmp6 = _AVX512_MUL(h1_imag, x6);
                q6 = _AVX512_ADD(q6, _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
        }
}

463 464 465 466 467 468 469 470 471
/*
 * Five-register kernel: 20 double complex or 40 float complex consecutive
 * elements of q per step.
 *
 * With q_i denoting the five-register slice starting at complex offset i*ldq,
 * the code computes
 *
 *     x    = q_0 + sum_{i=1}^{nb-1} conj(hh[i]) * q_i
 *     x    = -hh[0] * x
 *     q_0 += x
 *     q_i += hh[i] * x              for i = 1 .. nb-1
 *
 * Complex products are built from a broadcast real part, a broadcast
 * imaginary part, a real/imag swap (_AVX512_SHUFFLE with _SHUFFLE) and an
 * alternating FMA: FMSUBADD yields conj(h)*v, FMADDSUB yields h*v.
 *
 * All accesses use aligned loads/stores, so q and every q + i*ldq must be
 * 64-byte aligned (assumed to be guaranteed by the caller -- TODO confirm).
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // Scalar views of the interleaved (re, im) storage. The "_dbl" suffix
        // is kept for both precisions; in the single build these are floats.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX512_DATATYPE x1, x2, x3, x4, x5;
        __AVX512_DATATYPE q1, q2, q3, q4, q5;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5;
        int i=0;

        // Mask with only the sign bit set in every element; XOR-ing with it
        // negates a floating-point vector.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Scalars per 512-bit register. This is a file-scope macro, not a
        // local: it stays defined after this function and the kernel that
        // follows in this file uses it without defining it.
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#endif


        // x = q_0 (each register: 4 double complex / 8 float complex values)
        x1 = _AVX512_LOAD(&q_dbl[0]);
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);
        x5 = _AVX512_LOAD(&q_dbl[4*offset]);

        // x += conj(hh[i]) * q_i
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, q5);
                x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
        }

        // x = -hh[0] * x : broadcast hh[0], then flip the sign of both parts.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);

        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        tmp5 = _AVX512_MUL(h1_imag, x5);
        x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE));

        // q_0 += x
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);
        q5 = _AVX512_LOAD(&q_dbl[4*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);
        q5 = _AVX512_ADD(q5, x5);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);
        _AVX512_STORE(&q_dbl[4*offset], q5);

        // q_i += hh[i] * x
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                tmp5 = _AVX512_MUL(h1_imag, x5);
                q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
        }
}


629 630 631 632 633 634 635 636 637
// Single-vector update of a panel of q that is four 512-bit registers wide
// (per the function names: 16 complex doubles or 32 complex floats per row).
//
// Steps, in the order performed below:
//   1. x        = q[row 0] + sum over i = 1 .. nb-1 of (hh[i] combined with q[row i])
//   2. x        = (-hh[0]) combined with x
//   3. q[row 0] = q[row 0] + x
//   4. q[row i] = q[row i] + (hh[i] combined with x)   for i = 1 .. nb-1
//
// NOTE(review): "combined with" is presumably a complex product -- with the
// conjugate of hh[i] in step 1 (_AVX512_FMSUBADD) and the plain value in
// steps 2 and 4 (_AVX512_FMADDSUB). The exact arithmetic depends on the
// _AVX512_* and _SHUFFLE macros, which are defined outside this block.
//
// Parameters:
//   q    in/out. Row i starts at complex index i*ldq (q_dbl[2*i*ldq]); rows
//        0 .. nb-1 are read and written.
//   hh   hh[0] .. hh[nb-1] are read; real part at hh_dbl[2*i], imaginary part
//        at hh_dbl[2*i+1].
//   nb   number of rows of q / entries of hh that are processed. For nb <= 1
//        both loops are skipped and only row 0 is updated.
//   ldq  row stride of q, counted in complex elements.
//
// `offset` is a macro defined elsewhere in this file; presumably the number of
// real scalars held by one 512-bit register -- TODO confirm.
// NOTE(review): if _AVX512_LOAD/_AVX512_STORE are the aligned variants, q and
// ldq must keep every row 64-byte aligned -- confirm against the callers.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif

        __AVX512_DATATYPE x1, x2, x3, x4;
        __AVX512_DATATYPE q1, q2, q3, q4;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i;

        // Mask with only the sign bit set in every lane; XOR-ing with it negates a value.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Step 1: start the accumulators from row 0 of q ...
        x1 = _AVX512_LOAD(&q_dbl[0]);   // complex 1 2 3 4
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);
        x4 = _AVX512_LOAD(&q_dbl[3*offset]);  // complex 13 14 15 16

        // ... and add the contribution of every further row, weighted by hh[i].
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, q4);
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
        }

        // Step 2: scale the accumulators by -hh[0]. The negation is the same
        // sign-bit XOR for both precisions, so no #ifdef is needed here.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);

        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        tmp4 = _AVX512_MUL(h1_imag, x4);
        x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));

        // Step 3: row 0 of q receives the scaled accumulators unweighted.
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);
        q4 = _AVX512_LOAD(&q_dbl[3*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);
        q4 = _AVX512_ADD(q4, x4);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);
        _AVX512_STORE(&q_dbl[3*offset], q4);

        // Step 4: every further row receives the accumulators weighted by hh[i].
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                tmp4 = _AVX512_MUL(h1_imag, x4);
                q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }
}

767 768 769 770 771 772 773 774 775
// Single-vector update of a panel of q that is three 512-bit registers wide
// (per the function names: 12 complex doubles or 24 complex floats per row).
//
// Steps, in the order performed below:
//   1. x        = q[row 0] + sum over i = 1 .. nb-1 of (hh[i] combined with q[row i])
//   2. x        = (-hh[0]) combined with x
//   3. q[row 0] = q[row 0] + x
//   4. q[row i] = q[row i] + (hh[i] combined with x)   for i = 1 .. nb-1
//
// NOTE(review): "combined with" is presumably a complex product -- with the
// conjugate of hh[i] in step 1 (_AVX512_FMSUBADD) and the plain value in
// steps 2 and 4 (_AVX512_FMADDSUB). The exact arithmetic depends on the
// _AVX512_* and _SHUFFLE macros, which are defined outside this block.
//
// Parameters:
//   q    in/out. Row i starts at complex index i*ldq (q_dbl[2*i*ldq]); rows
//        0 .. nb-1 are read and written.
//   hh   hh[0] .. hh[nb-1] are read; real part at hh_dbl[2*i], imaginary part
//        at hh_dbl[2*i+1].
//   nb   number of rows of q / entries of hh that are processed. For nb <= 1
//        both loops are skipped and only row 0 is updated.
//   ldq  row stride of q, counted in complex elements.
//
// `offset` is a macro defined elsewhere in this file; presumably the number of
// real scalars held by one 512-bit register -- TODO confirm.
// NOTE(review): if _AVX512_LOAD/_AVX512_STORE are the aligned variants, q and
// ldq must keep every row 64-byte aligned -- confirm against the callers.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif

        // Only three register columns are processed here, so there is no x4/q4/tmp4.
        __AVX512_DATATYPE x1, x2, x3;
        __AVX512_DATATYPE q1, q2, q3;
        __AVX512_DATATYPE h1_real, h1_imag;
        __AVX512_DATATYPE tmp1, tmp2, tmp3;
        int i;

        // Mask with only the sign bit set in every lane; XOR-ing with it negates a value.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

        // Step 1: start the accumulators from row 0 of q ...
        x1 = _AVX512_LOAD(&q_dbl[0]);   // complex 1 2 3 4
        x2 = _AVX512_LOAD(&q_dbl[offset]);
        x3 = _AVX512_LOAD(&q_dbl[2*offset]);

        // ... and add the contribution of every further row, weighted by hh[i].
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

                tmp1 = _AVX512_MUL(h1_imag, q1);
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, q2);
                x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, q3);
                x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
        }

        // Step 2: scale the accumulators by -hh[0]. The negation is the same
        // sign-bit XOR for both precisions, so no #ifdef is needed here.
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);

        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);

        tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));

        tmp2 = _AVX512_MUL(h1_imag, x2);
        x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));

        tmp3 = _AVX512_MUL(h1_imag, x3);
        x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));

        // Step 3: row 0 of q receives the scaled accumulators unweighted.
        q1 = _AVX512_LOAD(&q_dbl[0]);
        q2 = _AVX512_LOAD(&q_dbl[offset]);
        q3 = _AVX512_LOAD(&q_dbl[2*offset]);

        q1 = _AVX512_ADD(q1, x1);
        q2 = _AVX512_ADD(q2, x2);
        q3 = _AVX512_ADD(q3, x3);

        _AVX512_STORE(&q_dbl[0], q1);
        _AVX512_STORE(&q_dbl[offset], q2);
        _AVX512_STORE(&q_dbl[2*offset], q3);

        // Step 4: every further row receives the accumulators weighted by hh[i].
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX512_SET1(hh_dbl[i*2]);
                h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);

                q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);

                tmp1 = _AVX512_MUL(h1_imag, x1);
                q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

                tmp2 = _AVX512_MUL(h1_imag, x2);
                q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));

                tmp3 = _AVX512_MUL(h1_imag, x3);
                q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));

                _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
        }
}


888 889 890 891 892 893