// complex_avx-avx2_1hv_template.c
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

// Force inlining of the static kernels into the dispatcher (GCC/Clang attribute).
#define __forceinline __attribute__((always_inline))

// --------------------------------------------------------------------------------------------------
// Precision dependent aliases.
//
// The kernels below are written once against the _AVX_* names; exactly one of
// DOUBLE_PRECISION_COMPLEX / SINGLE_PRECISION_COMPLEX is expected to be defined
// by the file that includes this template.
//
//   offset          number of real scalars held by one 256-bit register
//                   (4 doubles = 2 complex values, 8 floats = 4 complex values)
//   _SHUFFLE        immediate for _AVX_SHUFFLE that swaps the real and imaginary
//                   part inside every complex value
//   __ELPA_USE_FMA__  defined when fused multiply-add/sub intrinsics are used
//
// FMA selection: the FMA3 spelling is chosen when __AVX2__ is defined, the FMA4
// spelling otherwise when __FMA4__ is defined.  The two are now mutually
// exclusive; previously a compiler defining both macros saw the function-like
// macros defined twice with different bodies (the later, FMA3, definition won,
// accompanied by a redefinition diagnostic).
// NOTE(review): __AVX2__ is used as a stand-in for FMA3 availability; the build
// is presumably expected to enable FMA together with AVX2 - confirm.
// --------------------------------------------------------------------------------------------------

#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_ADDSUB _mm256_addsub_pd
#define _AVX_XOR _mm256_xor_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5

#ifdef HAVE_AVX2

#if defined(__AVX2__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#elif defined(__FMA4__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_pd
#define _AVX_FMSUBADD _mm256_FMSUBADD_pd

#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_ADDSUB _mm256_addsub_ps
#define _AVX_XOR _mm256_xor_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1

#ifdef HAVE_AVX2

#if defined(__AVX2__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#elif defined(__FMA4__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_ps
#define _AVX_FMSUBADD _mm256_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */

// Forward declarations of the fixed-width kernels.  The number in each name is
// the count of complex rows of q handled by one call; the dispatcher further
// down picks the kernel matching the number of rows still to be processed.
#ifdef DOUBLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_2_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static  __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)       :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value              :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

/*
 * Apply one complex Householder vector to q, in place.
 *
 * The rows of q are processed in chunks by the fixed-width kernels of this
 * file: first as many full-width chunks as possible (12 complex rows in double
 * precision, 24 in single precision), then at most one narrower kernel for the
 * rows that remain.
 *
 *   q     matrix data, updated in place; &q[i] is handed to the kernels as the
 *         start of row chunk i, ldq is handed on as the column stride
 *   hh    Householder vector, passed through unchanged to the kernels
 *   pnb   pointer to the length of hh (number of columns of q that are touched)
 *   pnq   not read, see the note below
 *   pldq  pointer to the leading dimension of q
 *
 * NOTE(review): the row count nq is read from *pldq, not from *pnq, i.e. all
 * ldq rows are processed.  This is kept as found because callers may rely on
 * it (it presumably lets the padded rows be processed so that only even chunk
 * widths occur) - confirm before changing.
 *
 * A remainder that matches none of the kernel widths is left unprocessed; a
 * build with WITH_DEBUG detects this and aborts.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx_avx2_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
#endif
{
        int i;
        int nb = *pnb;
        int nq = *pldq;         // sic: taken from pldq, see NOTE(review) above
        int ldq = *pldq;
        int worked_on = 0;      // rows handled so far, verified under WITH_DEBUG

        (void)pnq;              // part of the Fortran interface, intentionally unused

#ifdef DOUBLE_PRECISION_COMPLEX
        // Full-width chunks.  The bound nq-10 stops the loop as soon as 10 or
        // fewer rows are left, so that the remainder goes to a narrower kernel.
        for (i = 0; i < nq-10; i+=12)
        {
                hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 12;
        }
        if (nq == i)
        {
                return;
        }

        // Remainder: at most one of the narrower kernels applies.
        switch (nq-i)
        {
        case 10:
                hh_trafo_complex_kernel_10_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 10;
                break;
        case 8:
                hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 8;
                break;
        case 6:
                hh_trafo_complex_kernel_6_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 6;
                break;
        case 4:
                hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 4;
                break;
        case 2:
                hh_trafo_complex_kernel_2_AVX_1hv_double(&q[i], hh, nb, ldq);
                worked_on += 2;
                break;
        default:
                break;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        // Full-width chunks.  The bound nq-20 stops the loop as soon as 20 or
        // fewer rows are left, so that the remainder goes to a narrower kernel.
        for (i = 0; i < nq-20; i+=24)
        {
                hh_trafo_complex_kernel_24_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 24;
        }
        if (nq == i)
        {
                return;
        }

        // Remainder: at most one of the narrower kernels applies.
        switch (nq-i)
        {
        case 20:
                hh_trafo_complex_kernel_20_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 20;
                break;
        case 16:
                hh_trafo_complex_kernel_16_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 16;
                break;
        case 12:
                hh_trafo_complex_kernel_12_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 12;
                break;
        case 8:
                hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 8;
                break;
        case 4:
                hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
                worked_on += 4;
                break;
        default:
                break;
        }
#endif

#ifdef WITH_DEBUG
        if (worked_on != nq)
        {
                // Report on stderr: abort() is not required to flush buffered
                // stdout, so a message printed there could be lost.
                fprintf(stderr, "Error in complex avx-avx2 BLOCK 1 kernel \n");
                abort();
        }
#endif
}

305

306 307 308 309
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
310
static __forceinline void hh_trafo_complex_kernel_24_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
311 312 313
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
314 315
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
316 317
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
318 319
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
320
#endif
Andreas Marek's avatar
Andreas Marek committed
321 322 323 324 325
        __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
        __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
        __AVX_DATATYPE h1_real, h1_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
        int i=0;
326 327

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
328
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
329 330
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
331 332 333 334 335 336 337 338 339 340 341 342 343
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        x1 = _AVX_LOAD(&q_dbl[0]);
        x2 = _AVX_LOAD(&q_dbl[offset]);
        x3 = _AVX_LOAD(&q_dbl[2*offset]);
        x4 = _AVX_LOAD(&q_dbl[3*offset]);
        x5 = _AVX_LOAD(&q_dbl[4*offset]);
        x6 = _AVX_LOAD(&q_dbl[5*offset]);
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
344
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
345 346
                // conjugate
                h1_imag = _AVX_XOR(h1_imag, sign);
347 348
#endif

Andreas Marek's avatar
Andreas Marek committed
349 350 351 352 353 354
                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
355

Andreas Marek's avatar
Andreas Marek committed
356
                tmp1 = _AVX_MUL(h1_imag, q1);
357
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
358
                x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
359
#else
Andreas Marek's avatar
Andreas Marek committed
360
                x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
361
#endif
Andreas Marek's avatar
Andreas Marek committed
362
                tmp2 = _AVX_MUL(h1_imag, q2);
363
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
364
                x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
365
#else
Andreas Marek's avatar
Andreas Marek committed
366
                x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
367
#endif
Andreas Marek's avatar
Andreas Marek committed
368
                tmp3 = _AVX_MUL(h1_imag, q3);
369
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
370
                x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
371
#else
Andreas Marek's avatar
Andreas Marek committed
372
                x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
373 374
#endif

Andreas Marek's avatar
Andreas Marek committed
375
                tmp4 = _AVX_MUL(h1_imag, q4);
376
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
377
                x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
378
#else
Andreas Marek's avatar
Andreas Marek committed
379
                x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
380
#endif
Andreas Marek's avatar
Andreas Marek committed
381
                tmp5 = _AVX_MUL(h1_imag, q5);
382
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
383
                x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
384
#else
Andreas Marek's avatar
Andreas Marek committed
385
                x5 = _AVX_ADD(x5, _AVX_ADDSUB( _AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
386
#endif
Andreas Marek's avatar
Andreas Marek committed
387
                tmp6 = _AVX_MUL(h1_imag, q6);
388
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
389
                x6 = _AVX_ADD(x6, _AVX_FMSUBADD(h1_real, q6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
390
#else
Andreas Marek's avatar
Andreas Marek committed
391
                x6 = _AVX_ADD(x6, _AVX_ADDSUB( _AVX_MUL(h1_real, q6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
392
#endif
Andreas Marek's avatar
Andreas Marek committed
393
        }
394

Andreas Marek's avatar
Andreas Marek committed
395 396 397 398
        h1_real = _AVX_BROADCAST(&hh_dbl[0]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);
399

Andreas Marek's avatar
Andreas Marek committed
400
        tmp1 = _AVX_MUL(h1_imag, x1);
401
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
402
        x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
403
#else
Andreas Marek's avatar
Andreas Marek committed
404
        x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
405
#endif
Andreas Marek's avatar
Andreas Marek committed
406
        tmp2 = _AVX_MUL(h1_imag, x2);
407
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
408
        x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
409
#else
Andreas Marek's avatar
Andreas Marek committed
410
        x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
411
#endif
Andreas Marek's avatar
Andreas Marek committed
412
        tmp3 = _AVX_MUL(h1_imag, x3);
413
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
414
        x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
415
#else
Andreas Marek's avatar
Andreas Marek committed
416
        x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
417 418
#endif

Andreas Marek's avatar
Andreas Marek committed
419
        tmp4 = _AVX_MUL(h1_imag, x4);
420
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
421
        x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
422
#else
Andreas Marek's avatar
Andreas Marek committed
423
        x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
424
#endif
Andreas Marek's avatar
Andreas Marek committed
425
        tmp5 = _AVX_MUL(h1_imag, x5);
426
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
427
        x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
428
#else
Andreas Marek's avatar
Andreas Marek committed
429
        x5 = _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
430
#endif
Andreas Marek's avatar
Andreas Marek committed
431
        tmp6 = _AVX_MUL(h1_imag, x6);
432
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
433
        x6 = _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
434
#else
Andreas Marek's avatar
Andreas Marek committed
435
        x6 = _AVX_ADDSUB( _AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE));
436 437
#endif

Andreas Marek's avatar
Andreas Marek committed
438 439 440 441 442 443
        q1 = _AVX_LOAD(&q_dbl[0]);
        q2 = _AVX_LOAD(&q_dbl[offset]);
        q3 = _AVX_LOAD(&q_dbl[2*offset]);
        q4 = _AVX_LOAD(&q_dbl[3*offset]);
        q5 = _AVX_LOAD(&q_dbl[4*offset]);
        q6 = _AVX_LOAD(&q_dbl[5*offset]);
444 445

        q1 = _AVX_ADD(q1, x1);
Andreas Marek's avatar
Andreas Marek committed
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471
        q2 = _AVX_ADD(q2, x2);
        q3 = _AVX_ADD(q3, x3);
        q4 = _AVX_ADD(q4, x4);
        q5 = _AVX_ADD(q5, x5);
        q6 = _AVX_ADD(q6, x6);

        _AVX_STORE(&q_dbl[0], q1);
        _AVX_STORE(&q_dbl[offset], q2);
        _AVX_STORE(&q_dbl[2*offset], q3);
        _AVX_STORE(&q_dbl[3*offset], q4);
        _AVX_STORE(&q_dbl[4*offset], q5);
        _AVX_STORE(&q_dbl[5*offset], q6);

        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _AVX_LOAD(&q_dbl[(2*i*ldq)+5*offset]);

                tmp1 = _AVX_MUL(h1_imag, x1);
472
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
473
                q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
474
#else
Andreas Marek's avatar
Andreas Marek committed
475
                q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
476
#endif
Andreas Marek's avatar
Andreas Marek committed
477
                tmp2 = _AVX_MUL(h1_imag, x2);
478
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
479
                q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
480
#else
Andreas Marek's avatar
Andreas Marek committed
481
                q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
482
#endif
Andreas Marek's avatar
Andreas Marek committed
483
                tmp3 = _AVX_MUL(h1_imag, x3);
484
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
485
                q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
486
#else
Andreas Marek's avatar
Andreas Marek committed
487
                q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
488 489
#endif

Andreas Marek's avatar
Andreas Marek committed
490
                tmp4 = _AVX_MUL(h1_imag, x4);
491
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
492
                q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
493
#else
Andreas Marek's avatar
Andreas Marek committed
494
                q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
495
#endif
Andreas Marek's avatar
Andreas Marek committed
496
                tmp5 = _AVX_MUL(h1_imag, x5);
497
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
498
                q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
499
#else
Andreas Marek's avatar
Andreas Marek committed
500
                q5 = _AVX_ADD(q5, _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
501
#endif
Andreas Marek's avatar
Andreas Marek committed
502
                tmp6 = _AVX_MUL(h1_imag, x6);
503
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
504
                q6 = _AVX_ADD(q6, _AVX_FMADDSUB(h1_real, x6, _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
505
#else
Andreas Marek's avatar
Andreas Marek committed
506
                q6 = _AVX_ADD(q6, _AVX_ADDSUB( _AVX_MUL(h1_real, x6), _AVX_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
507 508
#endif

Andreas Marek's avatar
Andreas Marek committed
509 510 511 512 513 514 515
                _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
                _AVX_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
        }
516 517 518 519 520 521 522 523 524 525
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_10_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_20_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
526 527
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
528 529
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
530 531
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
532
#endif
Andreas Marek's avatar
Andreas Marek committed
533 534 535 536 537
        __AVX_DATATYPE x1, x2, x3, x4, x5, x6;
        __AVX_DATATYPE q1, q2, q3, q4, q5, q6;
        __AVX_DATATYPE h1_real, h1_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
        int i=0;
538 539

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
540
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
541 542
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
543 544 545 546 547 548 549 550 551 552 553 554
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        x1 = _AVX_LOAD(&q_dbl[0]);
        x2 = _AVX_LOAD(&q_dbl[offset]);
        x3 = _AVX_LOAD(&q_dbl[2*offset]);
        x4 = _AVX_LOAD(&q_dbl[3*offset]);
        x5 = _AVX_LOAD(&q_dbl[4*offset]);
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
555
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
556 557
                // conjugate
                h1_imag = _AVX_XOR(h1_imag, sign);
558 559
#endif

Andreas Marek's avatar
Andreas Marek committed
560 561 562 563 564
                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
565

Andreas Marek's avatar
Andreas Marek committed
566
                tmp1 = _AVX_MUL(h1_imag, q1);
567
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
568
                x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
569
#else
Andreas Marek's avatar
Andreas Marek committed
570
                x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
571
#endif
Andreas Marek's avatar
Andreas Marek committed
572
                tmp2 = _AVX_MUL(h1_imag, q2);
573
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
574
                x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
575
#else
Andreas Marek's avatar
Andreas Marek committed
576
                x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
577
#endif
Andreas Marek's avatar
Andreas Marek committed
578
                tmp3 = _AVX_MUL(h1_imag, q3);
579
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
580
                x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
581
#else
Andreas Marek's avatar
Andreas Marek committed
582
                x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
583 584
#endif

Andreas Marek's avatar
Andreas Marek committed
585
                tmp4 = _AVX_MUL(h1_imag, q4);
586
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
587
                x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
588
#else
Andreas Marek's avatar
Andreas Marek committed
589
                x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
590
#endif
Andreas Marek's avatar
Andreas Marek committed
591
                tmp5 = _AVX_MUL(h1_imag, q5);
592
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
593
                x5 = _AVX_ADD(x5, _AVX_FMSUBADD(h1_real, q5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
594
#else
Andreas Marek's avatar
Andreas Marek committed
595
                x5 = _AVX_ADD(x5, _AVX_ADDSUB( _AVX_MUL(h1_real, q5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
596
#endif
Andreas Marek's avatar
Andreas Marek committed
597
        }
598

Andreas Marek's avatar
Andreas Marek committed
599 600 601 602
        h1_real = _AVX_BROADCAST(&hh_dbl[0]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);
603

Andreas Marek's avatar
Andreas Marek committed
604
        tmp1 = _AVX_MUL(h1_imag, x1);
605
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
606
        x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
607
#else
Andreas Marek's avatar
Andreas Marek committed
608
        x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
609
#endif
Andreas Marek's avatar
Andreas Marek committed
610
        tmp2 = _AVX_MUL(h1_imag, x2);
611
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
612
        x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
613
#else
Andreas Marek's avatar
Andreas Marek committed
614
        x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
615
#endif
Andreas Marek's avatar
Andreas Marek committed
616
        tmp3 = _AVX_MUL(h1_imag, x3);
617
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
618
        x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
619
#else
Andreas Marek's avatar
Andreas Marek committed
620
        x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
621 622
#endif

Andreas Marek's avatar
Andreas Marek committed
623
        tmp4 = _AVX_MUL(h1_imag, x4);
624
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
625
        x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
626
#else
Andreas Marek's avatar
Andreas Marek committed
627
        x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
628
#endif
Andreas Marek's avatar
Andreas Marek committed
629
        tmp5 = _AVX_MUL(h1_imag, x5);
630
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
631
        x5 = _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
632
#else
Andreas Marek's avatar
Andreas Marek committed
633
        x5 = _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE));
634 635
#endif

Andreas Marek's avatar
Andreas Marek committed
636 637 638 639 640
        q1 = _AVX_LOAD(&q_dbl[0]);
        q2 = _AVX_LOAD(&q_dbl[offset]);
        q3 = _AVX_LOAD(&q_dbl[2*offset]);
        q4 = _AVX_LOAD(&q_dbl[3*offset]);
        q5 = _AVX_LOAD(&q_dbl[4*offset]);
641 642

        q1 = _AVX_ADD(q1, x1);
Andreas Marek's avatar
Andreas Marek committed
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665
        q2 = _AVX_ADD(q2, x2);
        q3 = _AVX_ADD(q3, x3);
        q4 = _AVX_ADD(q4, x4);
        q5 = _AVX_ADD(q5, x5);

        _AVX_STORE(&q_dbl[0], q1);
        _AVX_STORE(&q_dbl[offset], q2);
        _AVX_STORE(&q_dbl[2*offset], q3);
        _AVX_STORE(&q_dbl[3*offset], q4);
        _AVX_STORE(&q_dbl[4*offset], q5);

        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _AVX_LOAD(&q_dbl[(2*i*ldq)+4*offset]);

                tmp1 = _AVX_MUL(h1_imag, x1);
666
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
667
                q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
668
#else
Andreas Marek's avatar
Andreas Marek committed
669
                q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
670
#endif
Andreas Marek's avatar
Andreas Marek committed
671
                tmp2 = _AVX_MUL(h1_imag, x2);
672
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
673
                q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
674
#else
Andreas Marek's avatar
Andreas Marek committed
675
                q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
676
#endif
Andreas Marek's avatar
Andreas Marek committed
677
                tmp3 = _AVX_MUL(h1_imag, x3);
678
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
679
                q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
680
#else
Andreas Marek's avatar
Andreas Marek committed
681
                q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
682 683
#endif

Andreas Marek's avatar
Andreas Marek committed
684
                tmp4 = _AVX_MUL(h1_imag, x4);
685
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
686
                q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
687
#else
Andreas Marek's avatar
Andreas Marek committed
688
                q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
689
#endif
Andreas Marek's avatar
Andreas Marek committed
690
                tmp5 = _AVX_MUL(h1_imag, x5);
691
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
692
                q5 = _AVX_ADD(q5, _AVX_FMADDSUB(h1_real, x5, _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
693
#else
Andreas Marek's avatar
Andreas Marek committed
694
                q5 = _AVX_ADD(q5, _AVX_ADDSUB( _AVX_MUL(h1_real, x5), _AVX_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
695 696
#endif

Andreas Marek's avatar
Andreas Marek committed
697 698 699 700 701 702
                _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _AVX_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
        }
703 704
}

705

706 707 708 709
// Four-register variant of the single-Householder-vector ("1hv") complex kernel.
//
// Going by the arithmetic below, this applies one reflector to a block of q:
//   1. x    = q(row 0) + sum_{i=1..nb-1} conj(hh[i]) * q(row i)
//   2. x    = (-hh[0]) * x
//   3. q(row 0) += x
//   4. q(row i) += hh[i] * x            for i = 1..nb-1
// NOTE(review): the complex-product reading assumes _AVX_SHUFFLE(t, t, _SHUFFLE)
// swaps the real/imaginary lanes of every complex element and that the
// ADDSUB/FMADDSUB/FMSUBADD macros map to the usual AVX intrinsics -- those
// macros are defined outside this block; confirm against their definitions.
//
// Parameters:
//   q    in/out block, accessed through a real-typed pointer. Row i starts at
//        real index 2*i*ldq; four vector registers are processed per row, at
//        real offsets 0, offset, 2*offset and 3*offset ("offset" is defined
//        elsewhere in this file).
//   hh   reflector coefficients, read as (real, imag) pairs hh_dbl[2*i],
//        hh_dbl[2*i+1]. Entry 0 is used only as the scaling factor in step 2.
//   nb   number of rows of q / entries of hh that are touched. With nb <= 1
//        both loops are skipped and only row 0 is updated.
//   ldq  row stride of q, counted in complex elements.
//
// NOTE(review): any alignment requirement on q depends on how _AVX_LOAD and
// _AVX_STORE are defined -- not visible here.
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
        // The "_dbl" suffix is kept for both precisions.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __AVX_DATATYPE x1, x2, x3, x4;
        __AVX_DATATYPE q1, q2, q3, q4;
        __AVX_DATATYPE h1_real, h1_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;

        // Mask with only the sign bit set in every lane; XOR with it negates a lane.
#ifdef DOUBLE_PRECISION_COMPLEX
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        // Step 1: the accumulators start out as row 0 of q.
        x1 = _AVX_LOAD(&q_dbl[0]);
        x2 = _AVX_LOAD(&q_dbl[offset]);
        x3 = _AVX_LOAD(&q_dbl[2*offset]);
        x4 = _AVX_LOAD(&q_dbl[3*offset]);

        // Accumulate conj(hh[i]) * q(row i) over the remaining rows.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
                // conjugate: negate the imaginary part so that the ADDSUB path
                // below multiplies by conj(hh[i]). In FMA builds the sign flip
                // is skipped and _AVX_FMSUBADD is used in place of an
                // add/sub-ordered operation instead.
                h1_imag = _AVX_XOR(h1_imag, sign);
#endif

                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
                x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

                tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
                x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
                x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
        }

        // Step 2: scale the accumulators by -hh[0]; both the real and the
        // imaginary part of hh[0] get their sign bit flipped.
        h1_real = _AVX_BROADCAST(&hh_dbl[0]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);

        tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif

        tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
        tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif

        // Step 3: row 0 receives the scaled accumulators directly
        // (no multiplication by an hh entry for this row).
        q1 = _AVX_LOAD(&q_dbl[0]);
        q2 = _AVX_LOAD(&q_dbl[offset]);
        q3 = _AVX_LOAD(&q_dbl[2*offset]);
        q4 = _AVX_LOAD(&q_dbl[3*offset]);

        q1 = _AVX_ADD(q1, x1);
        q2 = _AVX_ADD(q2, x2);
        q3 = _AVX_ADD(q3, x3);
        q4 = _AVX_ADD(q4, x4);

        _AVX_STORE(&q_dbl[0], q1);
        _AVX_STORE(&q_dbl[offset], q2);
        _AVX_STORE(&q_dbl[2*offset], q3);
        _AVX_STORE(&q_dbl[3*offset], q4);

        // Step 4: q(row i) += hh[i] * x for the remaining rows. hh[i] is used
        // without a sign flip here, in both the FMA and the non-FMA build.
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);

                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

                tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
                q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
                q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif

                tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
                q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
                tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
                q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

                _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
880 881
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
882 883
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
884 885
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
886
#endif
Andreas Marek's avatar
Andreas Marek committed
887 888 889 890 891
        __AVX_DATATYPE x1, x2, x3, x4;
        __AVX_DATATYPE q1, q2, q3, q4;
        __AVX_DATATYPE h1_real, h1_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
892 893

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
894
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
895 896
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
897
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
898 899
#endif

Andreas Marek's avatar
Andreas Marek committed
900 901 902
        x1 = _AVX_LOAD(&q_dbl[0]);
        x2 = _AVX_LOAD(&q_dbl[offset]);
        x3 = _AVX_LOAD(&q_dbl[2*offset]);
903

Andreas Marek's avatar
Andreas Marek committed
904 905 906 907
        for (i = 1; i < nb; i++)
        {
                h1_real = _AVX_BROADCAST(&hh_dbl[i*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[(i*2)+1]);
908
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
909 910
                // conjugate
                h1_imag = _AVX_XOR(h1_imag, sign);
911 912 913
#endif


Andreas Marek's avatar
Andreas Marek committed
914 915 916
                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
917 918 919

                tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
920
                x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
921
#else
Andreas Marek's avatar
Andreas Marek committed
922
                x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
923
#endif
Andreas Marek's avatar
Andreas Marek committed
924
                tmp2 = _AVX_MUL(h1_imag, q2);
925
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
926
                x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
927
#else
Andreas Marek's avatar
Andreas Marek committed
928
                x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
929 930
#endif

Andreas Marek's avatar
Andreas Marek committed
931
                tmp3 = _AVX_MUL(h1_imag, q3);
932
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
933
                x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
934
#else
Andreas Marek's avatar
Andreas Marek committed
935
                x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
936
#endif
Andreas Marek's avatar
Andreas Marek committed
937
        }
938

Andreas Marek's avatar
Andreas Marek committed
939 940 941 942
        h1_real = _AVX_BROADCAST(&hh_dbl[0]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);
943

Andreas Marek's avatar
Andreas Marek committed
944
        tmp1 = _AVX_MUL(h1_imag, x1);
945
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
946
        x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
947
#else
Andreas Marek's avatar
Andreas Marek committed
948
        x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
949
#endif
Andreas Marek's avatar
Andreas Marek committed
950
        tmp2 = _AVX_MUL(h1_imag, x2);
951
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
952
        x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));