//    File: complex_avx-avx2_2hv_template.c
//
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//        Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//        Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//        Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//        Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//        and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
66
#include <stdio.h>
67
#include <stdlib.h>
68 69 70 71 72 73 74 75 76 77 78 79 80

/* Ask GCC/Clang to always inline the static kernel helpers of this file. */
#define __forceinline __attribute__((always_inline))

#ifdef DOUBLE_PRECISION_COMPLEX
/*
 * Double-precision mapping of the generic _AVX_* names used by the kernels.
 * `offset` is the number of scalars held by one 256-bit register (4 doubles);
 * the kernels use it as the stride between consecutive register loads.
 */
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_ADDSUB _mm256_addsub_pd
#define _AVX_XOR _mm256_xor_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_SET1 _mm256_set1_pd
#define _AVX_SHUFFLE _mm256_shuffle_pd
/* Shuffle immediate passed to _AVX_SHUFFLE(x, x, _SHUFFLE) by the kernels. */
#define _SHUFFLE 0x5
#define _CAST _mm256_castpd256_pd128

#ifdef HAVE_AVX2

/*
 * Select the fused multiply-add flavour. A compiler targeting a CPU with both
 * FMA3 and FMA4 predefines __AVX2__ and __FMA4__ at the same time; defining
 * the wrappers in both branches would redefine them with different bodies.
 * __AVX2__ is tested first because its definition was the one that took
 * effect before (it came last).
 */
#if defined(__AVX2__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#elif defined(__FMA4__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_pd
#define _AVX_FMSUBADD _mm256_FMSUBADD_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
/*
 * Single-precision mapping of the generic _AVX_* names used by the kernels.
 * `offset` is the number of scalars held by one 256-bit register (8 floats);
 * the kernels use it as the stride between consecutive register loads.
 */
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_ADDSUB _mm256_addsub_ps
#define _AVX_XOR _mm256_xor_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_SET1 _mm256_set1_ps
#define _AVX_SHUFFLE _mm256_shuffle_ps
/* Shuffle immediate passed to _AVX_SHUFFLE(x, x, _SHUFFLE) by the kernels. */
#define _SHUFFLE 0xb1
#define _CAST _mm256_castps256_ps128
#ifdef HAVE_AVX2

/*
 * Select the fused multiply-add flavour. A compiler targeting a CPU with both
 * FMA3 and FMA4 predefines __AVX2__ and __FMA4__ at the same time; defining
 * the wrappers in both branches would redefine them with different bodies.
 * __AVX2__ is tested first because its definition was the one that took
 * effect before (it came last).
 */
#if defined(__AVX2__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#elif defined(__FMA4__)
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#define _AVX_FMADDSUB _mm256_FMADDSUB_ps
#define _AVX_FMSUBADD _mm256_FMSUBADD_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
// Forward declarations of the double-precision kernels. The number in each
// name (8/6/4/2) matches the amount by which the driver advances its
// `worked_on` counter after calling that kernel.
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
// Forward declarations of the single-precision kernels (16/12/8/4). They take
// an additional `s1` argument; the driver passes the same value for `s` and
// `s1`.
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        ! complex(kind=c_double_complex)     :: q(*)
!f>        type(c_ptr), value                     :: q
!f>        complex(kind=c_double_complex)           :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        ! complex(kind=c_float_complex)   :: q(*)
!f>        type(c_ptr), value                  :: q
!f>        complex(kind=c_float_complex)        :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

/*
 * Driver for the 2-Householder-vector complex AVX/AVX2 kernels. All scalar
 * arguments arrive by reference, matching the Fortran interface declared in
 * the !f> comment block of this file.
 *
 *   q    : matrix data handed to the kernels in chunks starting at &q[i]
 *   hh   : Householder data; entries hh[0..nb-1] and hh[ldh..ldh+nb-1] are
 *          read here to build the scalar s
 *   pnb  : pointer to nb, the length used for the s accumulation and passed
 *          on to the kernels
 *   pnq  : unused by this function (see note on nq below)
 *   pldq : pointer to the leading dimension of q; also used as the number of
 *          entries to process
 *   pldh : pointer to the leading dimension of hh
 *
 * The work is split into full-width chunks (8 complex values in double
 * precision, 16 in single precision). The remainder must be exactly one of
 * 6/4/2 (double) or 12/8/4 (single); any other remainder is left unprocessed.
 * With WITH_DEBUG defined, a mismatch between processed and requested entries
 * aborts the program.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
        int i;
        int nb = *pnb;
        /* NOTE(review): nq is read from *pldq, not *pnq, so the whole leading
         * dimension is processed. Kept as is; confirm this is intentional. */
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on = 0;
        int remaining;

        /* pnq is part of the exported interface but not needed here. */
        (void)pnq;

        /* s = conj(hh[ldh+1]) + sum_{i=2}^{nb-1} hh[i-1] * conj(hh[ldh+i]) */
#ifdef DOUBLE_PRECISION_COMPLEX
        double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float complex s = conj(hh[(ldh)+1])*1.0f;
#endif

        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * conj(hh[(i+ldh)]);
        }

        /* Full-width chunks. */
#ifdef DOUBLE_PRECISION_COMPLEX
        for (i = 0; i < nq-6; i+=8)
        {
                hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        for (i = 0; i < nq-12; i+=16)
        {
                hh_trafo_complex_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s , s);
                worked_on += 16;
        }
#endif

        /* i is not modified below, so the remainder is fixed from here on. */
        remaining = nq - i;
        if (remaining == 0) {
          return;
        }

        /* Exactly one of the narrower kernels handles the tail, if it fits. */
#ifdef DOUBLE_PRECISION_COMPLEX
        if (remaining == 6) {
                hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 6;
        } else if (remaining == 4) {
                hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 4;
        } else if (remaining == 2) {
                hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 2;
        }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        if (remaining == 12) {
                hh_trafo_complex_kernel_12_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
                worked_on += 12;
        } else if (remaining == 8) {
                hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
                worked_on += 8;
        } else if (remaining == 4) {
                hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
                worked_on += 4;
        }
#endif

#ifdef WITH_DEBUG
        /* Every entry must have been covered by exactly one kernel call. */
        if (worked_on != nq) {
                printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
                abort();
        }
#endif
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
280
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
281 282 283 284
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
285 286 287
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
288 289
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
290 291 292
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
293
#endif
Andreas Marek's avatar
Andreas Marek committed
294 295 296 297 298 299
        __AVX_DATATYPE x1, x2, x3, x4;
        __AVX_DATATYPE y1, y2, y3, y4;
        __AVX_DATATYPE q1, q2, q3, q4;
        __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
300 301

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
302
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
303 304
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
305
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
306 307
#endif

Andreas Marek's avatar
Andreas Marek committed
308 309 310 311 312 313
        x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
        x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
        x3 = _AVX_LOAD(&q_dbl[(2*ldq)+2*offset]);
        x4 = _AVX_LOAD(&q_dbl[(2*ldq)+3*offset]);
        h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
314
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
315 316
        // conjugate
        h2_imag = _AVX_XOR(h2_imag, sign);
317 318
#endif

Andreas Marek's avatar
Andreas Marek committed
319 320 321 322
        y1 = _AVX_LOAD(&q_dbl[0]);
        y2 = _AVX_LOAD(&q_dbl[offset]);
        y3 = _AVX_LOAD(&q_dbl[2*offset]);
        y4 = _AVX_LOAD(&q_dbl[3*offset]);
323

Andreas Marek's avatar
Andreas Marek committed
324
        tmp1 = _AVX_MUL(h2_imag, x1);
325
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
326
        y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
327
#else
Andreas Marek's avatar
Andreas Marek committed
328
        y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
329
#endif
Andreas Marek's avatar
Andreas Marek committed
330
        tmp2 = _AVX_MUL(h2_imag, x2);
331
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
332
        y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
333
#else
Andreas Marek's avatar
Andreas Marek committed
334
        y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
335 336
#endif

Andreas Marek's avatar
Andreas Marek committed
337
        tmp3 = _AVX_MUL(h2_imag, x3);
338
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
339
        y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
340
#else
Andreas Marek's avatar
Andreas Marek committed
341
        y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
342
#endif
Andreas Marek's avatar
Andreas Marek committed
343
        tmp4 = _AVX_MUL(h2_imag, x4);
344
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
345
        y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
346
#else
Andreas Marek's avatar
Andreas Marek committed
347
        y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
348 349
#endif

Andreas Marek's avatar
Andreas Marek committed
350 351 352 353 354 355
        for (i = 2; i < nb; i++)
        {
                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
356

Andreas Marek's avatar
Andreas Marek committed
357 358
                h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
359
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
360 361
                // conjugate
                h1_imag = _AVX_XOR(h1_imag, sign);
362 363
#endif

Andreas Marek's avatar
Andreas Marek committed
364
                tmp1 = _AVX_MUL(h1_imag, q1);
365
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
366
                x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
367
#else
Andreas Marek's avatar
Andreas Marek committed
368
                x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
369
#endif
Andreas Marek's avatar
Andreas Marek committed
370
                tmp2 = _AVX_MUL(h1_imag, q2);
371
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
372
                x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
373
#else
Andreas Marek's avatar
Andreas Marek committed
374
                x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
375 376
#endif

Andreas Marek's avatar
Andreas Marek committed
377
                tmp3 = _AVX_MUL(h1_imag, q3);
378
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
379
                x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
380
#else
Andreas Marek's avatar
Andreas Marek committed
381
                x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
382
#endif
Andreas Marek's avatar
Andreas Marek committed
383
                tmp4 = _AVX_MUL(h1_imag, q4);
384
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
385
                x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
386
#else
Andreas Marek's avatar
Andreas Marek committed
387
                x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
388 389
#endif

Andreas Marek's avatar
Andreas Marek committed
390 391
                h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
392
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
393 394
                // conjugate
                h2_imag = _AVX_XOR(h2_imag, sign);
395 396
#endif

Andreas Marek's avatar
Andreas Marek committed
397
                tmp1 = _AVX_MUL(h2_imag, q1);
398
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
399
                y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
400
#else
Andreas Marek's avatar
Andreas Marek committed
401
                y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
402
#endif
Andreas Marek's avatar
Andreas Marek committed
403
                tmp2 = _AVX_MUL(h2_imag, q2);
404
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
405
                y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
406
#else
Andreas Marek's avatar
Andreas Marek committed
407
                y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
408 409
#endif

Andreas Marek's avatar
Andreas Marek committed
410
                tmp3 = _AVX_MUL(h2_imag, q3);
411
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
412
                y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
413
#else
Andreas Marek's avatar
Andreas Marek committed
414
                y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
415
#endif
Andreas Marek's avatar
Andreas Marek committed
416
                tmp4 = _AVX_MUL(h2_imag, q4);
417
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
418
                y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
419
#else
Andreas Marek's avatar
Andreas Marek committed
420
                y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
421
#endif
Andreas Marek's avatar
Andreas Marek committed
422
        }
423

Andreas Marek's avatar
Andreas Marek committed
424 425
        h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
426
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
427 428
        // conjugate
        h1_imag = _AVX_XOR(h1_imag, sign);
429 430
#endif

Andreas Marek's avatar
Andreas Marek committed
431 432 433 434
        q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
435

Andreas Marek's avatar
Andreas Marek committed
436
        tmp1 = _AVX_MUL(h1_imag, q1);
437
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
438
        x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
439
#else
Andreas Marek's avatar
Andreas Marek committed
440
        x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
441
#endif
Andreas Marek's avatar
Andreas Marek committed
442
        tmp2 = _AVX_MUL(h1_imag, q2);
443
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
444
        x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
445
#else
Andreas Marek's avatar
Andreas Marek committed
446
        x2 = _AVX_ADD(x2, _AVX_ADDSUB( _AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
447 448
#endif

Andreas Marek's avatar
Andreas Marek committed
449
        tmp3 = _AVX_MUL(h1_imag, q3);
450
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
451
        x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
452
#else
Andreas Marek's avatar
Andreas Marek committed
453
        x3 = _AVX_ADD(x3, _AVX_ADDSUB( _AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
454
#endif
Andreas Marek's avatar
Andreas Marek committed
455
        tmp4 = _AVX_MUL(h1_imag, q4);
456
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
457
        x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
458
#else
Andreas Marek's avatar
Andreas Marek committed
459
        x4 = _AVX_ADD(x4, _AVX_ADDSUB( _AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
460 461
#endif

Andreas Marek's avatar
Andreas Marek committed
462 463 464 465
        h1_real = _AVX_BROADCAST(&hh_dbl[0]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);
466

Andreas Marek's avatar
Andreas Marek committed
467
        tmp1 = _AVX_MUL(h1_imag, x1);
468
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
469
        x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
470
#else
Andreas Marek's avatar
Andreas Marek committed
471
        x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
472
#endif
Andreas Marek's avatar
Andreas Marek committed
473
        tmp2 = _AVX_MUL(h1_imag, x2);
474
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
475
        x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
476
#else
Andreas Marek's avatar
Andreas Marek committed
477
        x2 = _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
478 479
#endif

Andreas Marek's avatar
Andreas Marek committed
480
        tmp3 = _AVX_MUL(h1_imag, x3);
481
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
482
        x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
483
#else
Andreas Marek's avatar
Andreas Marek committed
484
        x3 = _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
485
#endif
Andreas Marek's avatar
Andreas Marek committed
486
        tmp4 = _AVX_MUL(h1_imag, x4);
487
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
488
        x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
489
#else
Andreas Marek's avatar
Andreas Marek committed
490
        x4 = _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
491 492
#endif

Andreas Marek's avatar
Andreas Marek committed
493 494 495 496
        h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
        h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
        h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
497

Andreas Marek's avatar
Andreas Marek committed
498 499 500 501
        h1_real = _AVX_XOR(h1_real, sign);
        h1_imag = _AVX_XOR(h1_imag, sign);
        h2_real = _AVX_XOR(h2_real, sign);
        h2_imag = _AVX_XOR(h2_imag, sign);
502 503

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
504
        tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
505 506
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
507 508
        tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
                             s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
509 510
#endif

Andreas Marek's avatar
Andreas Marek committed
511
        tmp1 = _AVX_MUL(h2_imag, tmp2);
512
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
513
        tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
514
#else
Andreas Marek's avatar
Andreas Marek committed
515
        tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
516 517
#endif

Andreas Marek's avatar
Andreas Marek committed
518 519
        h2_real = _AVX_SET1(tmp2[0]);
        h2_imag = _AVX_SET1(tmp2[1]);
520

Andreas Marek's avatar
Andreas Marek committed
521
        tmp1 = _AVX_MUL(h1_imag, y1);
522
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
523
        y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
524
#else
Andreas Marek's avatar
Andreas Marek committed
525
        y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
526
#endif
Andreas Marek's avatar
Andreas Marek committed
527
        tmp2 = _AVX_MUL(h1_imag, y2);
528
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
529
        y2 = _AVX_FMADDSUB(h1_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
530
#else
Andreas Marek's avatar
Andreas Marek committed
531
        y2 = _AVX_ADDSUB( _AVX_MUL(h1_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
532 533
#endif

Andreas Marek's avatar
Andreas Marek committed
534
        tmp3 = _AVX_MUL(h1_imag, y3);
535
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
536
        y3 = _AVX_FMADDSUB(h1_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
537
#else
Andreas Marek's avatar
Andreas Marek committed
538
        y3 = _AVX_ADDSUB( _AVX_MUL(h1_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
539
#endif
Andreas Marek's avatar
Andreas Marek committed
540
        tmp4 = _AVX_MUL(h1_imag, y4);
541
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
542
        y4 = _AVX_FMADDSUB(h1_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
543
#else
Andreas Marek's avatar
Andreas Marek committed
544
        y4 = _AVX_ADDSUB( _AVX_MUL(h1_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
545 546
#endif

Andreas Marek's avatar
Andreas Marek committed
547
        tmp1 = _AVX_MUL(h2_imag, x1);
548
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
549
        y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
550
#else
Andreas Marek's avatar
Andreas Marek committed
551
        y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
552
#endif
Andreas Marek's avatar
Andreas Marek committed
553
        tmp2 = _AVX_MUL(h2_imag, x2);
554
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
555
        y2 = _AVX_ADD(y2, _AVX_FMADDSUB(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
556
#else
Andreas Marek's avatar
Andreas Marek committed
557
        y2 = _AVX_ADD(y2, _AVX_ADDSUB( _AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
558 559
#endif

Andreas Marek's avatar
Andreas Marek committed
560
        tmp3 = _AVX_MUL(h2_imag, x3);
561
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
562
        y3 = _AVX_ADD(y3, _AVX_FMADDSUB(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
563
#else
Andreas Marek's avatar
Andreas Marek committed
564
        y3 = _AVX_ADD(y3, _AVX_ADDSUB( _AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
565
#endif
Andreas Marek's avatar
Andreas Marek committed
566
        tmp4 = _AVX_MUL(h2_imag, x4);
567
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
568
        y4 = _AVX_ADD(y4, _AVX_FMADDSUB(h2_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
569
#else
Andreas Marek's avatar
Andreas Marek committed
570
        y4 = _AVX_ADD(y4, _AVX_ADDSUB( _AVX_MUL(h2_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
571 572
#endif

Andreas Marek's avatar
Andreas Marek committed
573 574 575 576
        q1 = _AVX_LOAD(&q_dbl[0]);
        q2 = _AVX_LOAD(&q_dbl[offset]);
        q3 = _AVX_LOAD(&q_dbl[2*offset]);
        q4 = _AVX_LOAD(&q_dbl[3*offset]);
577

Andreas Marek's avatar
Andreas Marek committed
578 579 580 581
        q1 = _AVX_ADD(q1, y1);
        q2 = _AVX_ADD(q2, y2);
        q3 = _AVX_ADD(q3, y3);
        q4 = _AVX_ADD(q4, y4);
582 583


Andreas Marek's avatar
Andreas Marek committed
584 585 586 587
        _AVX_STORE(&q_dbl[0], q1);
        _AVX_STORE(&q_dbl[offset], q2);
        _AVX_STORE(&q_dbl[2*offset], q3);
        _AVX_STORE(&q_dbl[3*offset], q4);
588

Andreas Marek's avatar
Andreas Marek committed
589 590
        h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
591

Andreas Marek's avatar
Andreas Marek committed
592 593 594 595
        q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
        q2 = _AVX_LOAD(&q_dbl[(ldq*2)+offset]);
        q3 = _AVX_LOAD(&q_dbl[(ldq*2)+2*offset]);
        q4 = _AVX_LOAD(&q_dbl[(ldq*2)+3*offset]);
596

Andreas Marek's avatar
Andreas Marek committed
597 598 599 600
        q1 = _AVX_ADD(q1, x1);
        q2 = _AVX_ADD(q2, x2);
        q3 = _AVX_ADD(q3, x3);
        q4 = _AVX_ADD(q4, x4);
601

Andreas Marek's avatar
Andreas Marek committed
602
        tmp1 = _AVX_MUL(h2_imag, y1);
603
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
604
        q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
605
#else
Andreas Marek's avatar
Andreas Marek committed
606
        q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
607
#endif
Andreas Marek's avatar
Andreas Marek committed
608
        tmp2 = _AVX_MUL(h2_imag, y2);
609
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
610
        q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
611
#else
Andreas Marek's avatar
Andreas Marek committed
612
        q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
613 614
#endif

Andreas Marek's avatar
Andreas Marek committed
615
        tmp3 = _AVX_MUL(h2_imag, y3);
616
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
617
        q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
618
#else
Andreas Marek's avatar
Andreas Marek committed
619
        q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
620
#endif
Andreas Marek's avatar
Andreas Marek committed
621
        tmp4 = _AVX_MUL(h2_imag, y4);
622
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
623
        q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h2_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
624
#else
Andreas Marek's avatar
Andreas Marek committed
625
        q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h2_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
626 627
#endif

Andreas Marek's avatar
Andreas Marek committed
628 629 630 631
        _AVX_STORE(&q_dbl[(ldq*2)+0], q1);
        _AVX_STORE(&q_dbl[(ldq*2)+offset], q2);
        _AVX_STORE(&q_dbl[(ldq*2)+2*offset], q3);
        _AVX_STORE(&q_dbl[(ldq*2)+3*offset], q4);
632

Andreas Marek's avatar
Andreas Marek committed
633 634
        for (i = 2; i < nb; i++)
        {
635

Andreas Marek's avatar
Andreas Marek committed
636 637 638 639
                q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
640

Andreas Marek's avatar
Andreas Marek committed
641 642
                h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
                h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
643

Andreas Marek's avatar
Andreas Marek committed
644
                tmp1 = _AVX_MUL(h1_imag, x1);
645
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
646
                q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
647
#else
Andreas Marek's avatar
Andreas Marek committed
648
                q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
649
#endif
Andreas Marek's avatar
Andreas Marek committed
650
                tmp2 = _AVX_MUL(h1_imag, x2);
651
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
652
                q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
653
#else
Andreas Marek's avatar
Andreas Marek committed
654
                q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
655 656
#endif

Andreas Marek's avatar
Andreas Marek committed
657
                tmp3 = _AVX_MUL(h1_imag, x3);
658
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
659
                q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
660
#else
Andreas Marek's avatar
Andreas Marek committed
661
                q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
662
#endif
Andreas Marek's avatar
Andreas Marek committed
663
                tmp4 = _AVX_MUL(h1_imag, x4);
664
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
665
                q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
666
#else
Andreas Marek's avatar
Andreas Marek committed
667
                q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
668 669
#endif

Andreas Marek's avatar
Andreas Marek committed
670 671
                h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
                h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
672

Andreas Marek's avatar
Andreas Marek committed
673
                tmp1 = _AVX_MUL(h2_imag, y1);
674
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
675
                q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
676
#else
Andreas Marek's avatar
Andreas Marek committed
677
                q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
678
#endif
Andreas Marek's avatar
Andreas Marek committed
679
                tmp2 = _AVX_MUL(h2_imag, y2);
680
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
681
                q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h2_real, y2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
682
#else
Andreas Marek's avatar
Andreas Marek committed
683
                q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h2_real, y2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
684 685
#endif

Andreas Marek's avatar
Andreas Marek committed
686
                tmp3 = _AVX_MUL(h2_imag, y3);
687
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
688
                q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h2_real, y3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
689
#else
Andreas Marek's avatar
Andreas Marek committed
690
                q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h2_real, y3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
691
#endif
Andreas Marek's avatar
Andreas Marek committed
692
                tmp4 = _AVX_MUL(h2_imag, y4);
693
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
694
                q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h2_real, y4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
695
#else
Andreas Marek's avatar
Andreas Marek committed
696
                q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h2_real, y4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
697 698
#endif

Andreas Marek's avatar
Andreas Marek committed
699 700 701 702 703 704 705
                _AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _AVX_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _AVX_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _AVX_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
        }
        h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
        h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
706

Andreas Marek's avatar
Andreas Marek committed
707 708 709 710
        q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
        q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
        q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
        q4 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
711

Andreas Marek's avatar
Andreas Marek committed
712
        tmp1 = _AVX_MUL(h1_imag, x1);
713
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
714
        q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
715
#else
Andreas Marek's avatar
Andreas Marek committed
716
        q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
717
#endif
Andreas Marek's avatar
Andreas Marek committed
718
        tmp2 = _AVX_MUL(h1_imag, x2);
719
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
720
        q2 = _AVX_ADD(q2, _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
721
#else
Andreas Marek's avatar
Andreas Marek committed
722
        q2 = _AVX_ADD(q2, _AVX_ADDSUB( _AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
723 724
#endif

Andreas Marek's avatar
Andreas Marek committed
725
        tmp3 = _AVX_MUL(h1_imag, x3);
726
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
727
        q3 = _AVX_ADD(q3, _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
728
#else
Andreas Marek's avatar
Andreas Marek committed
729
        q3 = _AVX_ADD(q3, _AVX_ADDSUB( _AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
730
#endif
Andreas Marek's avatar
Andreas Marek committed
731
        tmp4 = _AVX_MUL(h1_imag, x4);
732
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
733
        q4 = _AVX_ADD(q4, _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
734
#else
Andreas Marek's avatar
Andreas Marek committed
735
        q4 = _AVX_ADD(q4, _AVX_ADDSUB( _AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
736 737
#endif

Andreas Marek's avatar
Andreas Marek committed
738 739 740 741
        _AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
        _AVX_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
        _AVX_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
        _AVX_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
742 743 744 745
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
746 747 748 749 750
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif

751
{
752
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
753 754 755
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
        double* s_dbl = (double*)(&s);
756 757
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
758 759 760
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
        float* s_dbl = (float*)(&s);
761
#endif
Andreas Marek's avatar
Andreas Marek committed
762 763 764 765 766 767
        __AVX_DATATYPE x1, x2, x3;
        __AVX_DATATYPE y1, y2, y3;
        __AVX_DATATYPE q1, q2, q3;
        __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
        __AVX_DATATYPE tmp1, tmp2, tmp3;
        int i=0;
768

769
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
770
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
771 772
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
773
        __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
774
#endif
Andreas Marek's avatar
Andreas Marek committed
775 776 777
        x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
        x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
        x3 = _AVX_LOAD(&q_dbl[(2*ldq)+2*offset]);
778

Andreas Marek's avatar
Andreas Marek committed
779 780
        h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
        h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
781
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
782 783
        // conjugate
        h2_imag = _AVX_XOR(h2_imag, sign);
784 785
#endif

Andreas Marek's avatar
Andreas Marek committed
786 787 788
        y1 = _AVX_LOAD(&q_dbl[0]);
        y2 = _AVX_LOAD(&q_dbl[offset]);
        y3 = _AVX_LOAD(&q_dbl[2*offset]);
789

Andreas Marek's avatar
Andreas Marek committed
790
        tmp1 = _AVX_MUL(h2_imag, x1);
791
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
792
        y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
793
#else
Andreas Marek's avatar
Andreas Marek committed
794