real_avx-avx2_4hv_template.c 59.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <x86intrin.h>
65 66
#include <stdio.h>
#include <stdlib.h>
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_SUB _mm256_sub_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_BROADCAST _mm256_broadcast_sd

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
#error "This should be prop _mm256_msub_pd instead of _mm256_msub"
#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif

#endif

#define _AVX_FMA _mm256_FMA_pd
#define _AVX_NFMA _mm256_NFMA_pd
#define _AVX_FMSUB _mm256_FMSUB_pd
#endif /* DOUBLE_PRECISION */

#ifdef SINGLE_PRECISION_REAL
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_SUB _mm256_sub_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_BROADCAST _mm256_broadcast_ss

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
#error "This should be prop _mm256_msub_ps instead of _mm256_msub"
#define _mm256_FMSUB_ps(a,b,c) _mm256_msub(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
#endif

#endif
#define _AVX_FMA _mm256_FMA_ps
#define _AVX_NFMA _mm256_NFMA_ps
#define _AVX_FMSUB _mm256_FMSUB_ps
#endif /* SINGLE_PRECISION */

#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);

void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);

void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine quad_hh_trafo_real_avx_avx2_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="quad_hh_trafo_real_avx_avx2_4hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine quad_hh_trafo_real_avx_avx2_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="quad_hh_trafo_real_avx_avx2_4hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_float)      :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif


#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
193 194 195 196 197 198
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;
199

Andreas Marek's avatar
Andreas Marek committed
200
        worked_on = 0;
201

Andreas Marek's avatar
Andreas Marek committed
202 203
        // calculating scalar products to compute
        // 4 householder vectors simultaneously
204
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
205 206 207 208 209 210
        double s_1_2 = hh[(ldh)+1];
        double s_1_3 = hh[(ldh*2)+2];
        double s_2_3 = hh[(ldh*2)+1];
        double s_1_4 = hh[(ldh*3)+3];
        double s_2_4 = hh[(ldh*3)+2];
        double s_3_4 = hh[(ldh*3)+1];
211 212
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
        float s_1_2 = hh[(ldh)+1];
        float s_1_3 = hh[(ldh*2)+2];
        float s_2_3 = hh[(ldh*2)+1];
        float s_1_4 = hh[(ldh*3)+3];
        float s_2_4 = hh[(ldh*3)+2];
        float s_3_4 = hh[(ldh*3)+1];
#endif

        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
        s_1_2 += hh[2-1] * hh[(2+ldh)];
        s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
        s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)];

        // loop counter = 3
        s_1_2 += hh[3-1] * hh[(3+ldh)];
        s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)];
        s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)];

        s_1_3 += hh[3-2] * hh[3+(ldh*2)];
        s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)];

        #pragma ivdep
        for (i = 4; i < nb; i++)
        {
                s_1_2 += hh[i-1] * hh[(i+ldh)];
                s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)];
                s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];

                s_1_3 += hh[i-2] * hh[i+(ldh*2)];
                s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];

                s_1_4 += hh[i-3] * hh[i+(ldh*3)];
        }

        // Production level kernel calls with padding
249
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
250 251 252 253 254
        for (i = 0; i < nq-8; i+=12)
        {
                hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 12;
        }
255 256
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
257 258 259 260 261
        for (i = 0; i < nq-16; i+=24)
        {
                hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 24;
        }
262
#endif
Andreas Marek's avatar
Andreas Marek committed
263 264 265 266
        if (nq == i)
        {
                return;
        }
267

268
#ifdef DOUBLE_PRECISION_REAL
269
        if (nq-i == 8)
Andreas Marek's avatar
Andreas Marek committed
270 271 272 273
        {
                hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 8;
        }
274
#endif
275

276
#ifdef SINGLE_PRECISION_REAL
277
        if (nq-i == 16)
Andreas Marek's avatar
Andreas Marek committed
278 279 280 281
        {
                hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 16;
        }
282 283 284 285
#endif

#ifdef DOUBLE_PRECISION_REAL
        if (nq-i == 4)
Andreas Marek's avatar
Andreas Marek committed
286 287 288 289
        {
                hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 4;
        }
290
#endif
291 292 293

#ifdef SINGLE_PRECISION_REAL
        if (nq-i == 8)
Andreas Marek's avatar
Andreas Marek committed
294 295 296 297
        {
                hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
                worked_on += 8;
        }
298 299
#endif

300
#ifdef WITH_DEBUG
301
        if (worked_on != nq)
Andreas Marek's avatar
Andreas Marek committed
302 303 304 305
        {
            printf("Error in real AVX/AVX2 BLOCK4 kernel \n");
            abort();
        }
306
#endif
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
}
/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 12 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 24 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [12 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;

        __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*3]);
        __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*2]);
        __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq]);
        __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[0]);

        __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
        __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
        __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
        __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
        __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
        __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);

#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
        w1 = _AVX_FMA(a2_1, h_4_2, w1);
        w1 = _AVX_FMA(a1_1, h_4_1, w1);
        register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
        z1 = _AVX_FMA(a1_1, h_3_1, z1);
        register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
        register __AVX_DATATYPE x1 = a1_1;
#else
        register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
        w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
        w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
        register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
        z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
        register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
        register __AVX_DATATYPE x1 = a1_1;
360
#endif
Andreas Marek's avatar
Andreas Marek committed
361 362 363 364 365

        __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
        __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
        __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[ldq+offset]);
        __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[0+offset]);
366 367

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
368 369 370 371 372 373 374
        register __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
        w2 = _AVX_FMA(a2_2, h_4_2, w2);
        w2 = _AVX_FMA(a1_2, h_4_1, w2);
        register __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
        z2 = _AVX_FMA(a1_2, h_3_1, z2);
        register __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
        register __AVX_DATATYPE x2 = a1_2;
375
#else
Andreas Marek's avatar
Andreas Marek committed
376 377 378 379 380 381 382
        register __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
        w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
        w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
        register __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
        z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
        register __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
        register __AVX_DATATYPE x2 = a1_2;
383 384
#endif

Andreas Marek's avatar
Andreas Marek committed
385 386 387 388
        __AVX_DATATYPE a1_3 = _AVX_LOAD(&q[(ldq*3)+2*offset]);
        __AVX_DATATYPE a2_3 = _AVX_LOAD(&q[(ldq*2)+2*offset]);
        __AVX_DATATYPE a3_3 = _AVX_LOAD(&q[ldq+2*offset]);
        __AVX_DATATYPE a4_3 = _AVX_LOAD(&q[0+2*offset]);
389 390

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
391 392 393 394 395 396 397
        register __AVX_DATATYPE w3 = _AVX_FMA(a3_3, h_4_3, a4_3);
        w3 = _AVX_FMA(a2_3, h_4_2, w3);
        w3 = _AVX_FMA(a1_3, h_4_1, w3);
        register __AVX_DATATYPE z3 = _AVX_FMA(a2_3, h_3_2, a3_3);
        z3 = _AVX_FMA(a1_3, h_3_1, z3);
        register __AVX_DATATYPE y3 = _AVX_FMA(a1_3, h_2_1, a2_3);
        register __AVX_DATATYPE x3 = a1_3;
398
#else
Andreas Marek's avatar
Andreas Marek committed
399 400 401 402 403 404 405
        register __AVX_DATATYPE w3 = _AVX_ADD(a4_3, _AVX_MUL(a3_3, h_4_3));
        w3 = _AVX_ADD(w3, _AVX_MUL(a2_3, h_4_2));
        w3 = _AVX_ADD(w3, _AVX_MUL(a1_3, h_4_1));
        register __AVX_DATATYPE z3 = _AVX_ADD(a3_3, _AVX_MUL(a2_3, h_3_2));
        z3 = _AVX_ADD(z3, _AVX_MUL(a1_3, h_3_1));
        register __AVX_DATATYPE y3 = _AVX_ADD(a2_3, _AVX_MUL(a1_3, h_2_1));
        register __AVX_DATATYPE x3 = a1_3;
406 407
#endif

Andreas Marek's avatar
Andreas Marek committed
408 409 410
        __AVX_DATATYPE q1;
        __AVX_DATATYPE q2;
        __AVX_DATATYPE q3;
411

Andreas Marek's avatar
Andreas Marek committed
412 413 414 415
        __AVX_DATATYPE h1;
        __AVX_DATATYPE h2;
        __AVX_DATATYPE h3;
        __AVX_DATATYPE h4;
416

Andreas Marek's avatar
Andreas Marek committed
417 418 419 420 421 422
        for(i = 4; i < nb; i++)
        {
                h1 = _AVX_BROADCAST(&hh[i-3]);
                q1 = _AVX_LOAD(&q[i*ldq]);
                q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
                q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
423
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
424 425 426
                x1 = _AVX_FMA(q1, h1, x1);
                x2 = _AVX_FMA(q2, h1, x2);
                x3 = _AVX_FMA(q3, h1, x3);
427
#else
Andreas Marek's avatar
Andreas Marek committed
428 429 430
                x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
                x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
                x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
431 432
#endif

Andreas Marek's avatar
Andreas Marek committed
433
                h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
434
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
435 436 437
                y1 = _AVX_FMA(q1, h2, y1);
                y2 = _AVX_FMA(q2, h2, y2);
                y3 = _AVX_FMA(q3, h2, y3);
438
#else
Andreas Marek's avatar
Andreas Marek committed
439 440 441
                y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
                y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
                y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
442 443
#endif

Andreas Marek's avatar
Andreas Marek committed
444
                h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
445
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
446 447 448
                z1 = _AVX_FMA(q1, h3, z1);
                z2 = _AVX_FMA(q2, h3, z2);
                z3 = _AVX_FMA(q3, h3, z3);
449
#else
Andreas Marek's avatar
Andreas Marek committed
450 451 452
                z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
                z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
                z3 = _AVX_ADD(z3, _AVX_MUL(q3,h3));
453 454
#endif

Andreas Marek's avatar
Andreas Marek committed
455
                h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
456
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
457 458 459
                w1 = _AVX_FMA(q1, h4, w1);
                w2 = _AVX_FMA(q2, h4, w2);
                w3 = _AVX_FMA(q3, h4, w3);
460
#else
Andreas Marek's avatar
Andreas Marek committed
461 462 463
                w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
                w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
                w3 = _AVX_ADD(w3, _AVX_MUL(q3,h4));
464
#endif
Andreas Marek's avatar
Andreas Marek committed
465
        }
466

Andreas Marek's avatar
Andreas Marek committed
467
        h1 = _AVX_BROADCAST(&hh[nb-3]);
468

Andreas Marek's avatar
Andreas Marek committed
469 470 471
        q1 = _AVX_LOAD(&q[nb*ldq]);
        q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
        q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
472 473

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
474 475 476
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
        x3 = _AVX_FMA(q3, h1, x3);
477
#else
Andreas Marek's avatar
Andreas Marek committed
478 479 480
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
        x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
481 482
#endif

Andreas Marek's avatar
Andreas Marek committed
483
        h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
484
#ifdef __FMA4_
Andreas Marek's avatar
Andreas Marek committed
485 486 487
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
        y3 = _AVX_FMA(q3, h2, y3);
488
#else
Andreas Marek's avatar
Andreas Marek committed
489 490 491
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
        y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
492 493
#endif

Andreas Marek's avatar
Andreas Marek committed
494
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
495
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
496 497 498
        z1 = _AVX_FMA(q1, h3, z1);
        z2 = _AVX_FMA(q2, h3, z2);
        z3 = _AVX_FMA(q3, h3, z3);
499
#else
Andreas Marek's avatar
Andreas Marek committed
500 501 502
        z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
        z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
        z3 = _AVX_ADD(z3, _AVX_MUL(q3,h3));
503 504
#endif

Andreas Marek's avatar
Andreas Marek committed
505
        h1 = _AVX_BROADCAST(&hh[nb-2]);
506

Andreas Marek's avatar
Andreas Marek committed
507 508 509
        q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
        q3 = _AVX_LOAD(&q[((nb+1)*ldq)+2*offset]);
510 511

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
512 513 514
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
        x3 = _AVX_FMA(q3, h1, x3);
515
#else
Andreas Marek's avatar
Andreas Marek committed
516 517 518
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
        x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
519 520
#endif

Andreas Marek's avatar
Andreas Marek committed
521
        h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
522 523

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
524 525 526
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
        y3 = _AVX_FMA(q3, h2, y3);
527
#else
Andreas Marek's avatar
Andreas Marek committed
528 529 530
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
        y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
531 532
#endif

Andreas Marek's avatar
Andreas Marek committed
533
        h1 = _AVX_BROADCAST(&hh[nb-1]);
534

Andreas Marek's avatar
Andreas Marek committed
535 536 537
        q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
        q3 = _AVX_LOAD(&q[((nb+2)*ldq)+2*offset]);
538 539

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
540 541 542
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
        x3 = _AVX_FMA(q3, h1, x3);
543
#else
Andreas Marek's avatar
Andreas Marek committed
544 545 546
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
        x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
547 548
#endif

Andreas Marek's avatar
Andreas Marek committed
549 550 551
        /////////////////////////////////////////////////////
        // Rank-1 update of Q [12 x nb+3]
        /////////////////////////////////////////////////////
552

Andreas Marek's avatar
Andreas Marek committed
553
        __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
554

Andreas Marek's avatar
Andreas Marek committed
555 556 557 558
        h1 = tau1;
        x1 = _AVX_MUL(x1, h1);
        x2 = _AVX_MUL(x2, h1);
        x3 = _AVX_MUL(x3, h1);
559

Andreas Marek's avatar
Andreas Marek committed
560 561
        __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
        __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&s_1_2);
562

Andreas Marek's avatar
Andreas Marek committed
563 564
        h1 = tau2;
        h2 = _AVX_MUL(h1, vs_1_2);
565
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
566 567 568
        y1 = _AVX_FMSUB(y1, h1, _AVX_MUL(x1,h2));
        y2 = _AVX_FMSUB(y2, h1, _AVX_MUL(x2,h2));
        y3 = _AVX_FMSUB(y3, h1, _AVX_MUL(x3,h2));
569
#else
Andreas Marek's avatar
Andreas Marek committed
570 571 572
        y1 = _AVX_SUB(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
        y2 = _AVX_SUB(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
        y3 = _AVX_SUB(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
573 574
#endif

Andreas Marek's avatar
Andreas Marek committed
575 576 577
        __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
        __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&s_1_3);
        __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&s_2_3);
578

Andreas Marek's avatar
Andreas Marek committed
579 580 581
        h1 = tau3;
        h2 = _AVX_MUL(h1, vs_1_3);
        h3 = _AVX_MUL(h1, vs_2_3);
582
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
583 584 585
        z1 = _AVX_FMSUB(z1, h1, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
        z2 = _AVX_FMSUB(z2, h1, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
        z3 = _AVX_FMSUB(z3, h1, _AVX_FMA(y3, h3, _AVX_MUL(x3,h2)));
586
#else
Andreas Marek's avatar
Andreas Marek committed
587 588 589
        z1 = _AVX_SUB(_AVX_MUL(z1,h1), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
        z2 = _AVX_SUB(_AVX_MUL(z2,h1), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
        z3 = _AVX_SUB(_AVX_MUL(z3,h1), _AVX_ADD(_AVX_MUL(y3,h3), _AVX_MUL(x3,h2)));
590 591
#endif

Andreas Marek's avatar
Andreas Marek committed
592 593 594 595
        __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
        __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&s_1_4);
        __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&s_2_4);
        __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&s_3_4);
596

Andreas Marek's avatar
Andreas Marek committed
597 598 599 600
        h1 = tau4;
        h2 = _AVX_MUL(h1, vs_1_4);
        h3 = _AVX_MUL(h1, vs_2_4);
        h4 = _AVX_MUL(h1, vs_3_4);
601
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
602 603 604
        w1 = _AVX_FMSUB(w1, h1, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
        w2 = _AVX_FMSUB(w2, h1, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
        w3 = _AVX_FMSUB(w3, h1, _AVX_FMA(z3, h4, _AVX_FMA(y3, h3, _AVX_MUL(x3,h2))));
605
#else
Andreas Marek's avatar
Andreas Marek committed
606 607 608
        w1 = _AVX_SUB(_AVX_MUL(w1,h1), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
        w2 = _AVX_SUB(_AVX_MUL(w2,h1), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
        w3 = _AVX_SUB(_AVX_MUL(w3,h1), _AVX_ADD(_AVX_MUL(z3,h4), _AVX_ADD(_AVX_MUL(y3,h3), _AVX_MUL(x3,h2))));
609 610
#endif

Andreas Marek's avatar
Andreas Marek committed
611 612 613 614 615 616 617 618 619
        q1 = _AVX_LOAD(&q[0]);
        q2 = _AVX_LOAD(&q[offset]);
        q3 = _AVX_LOAD(&q[2*offset]);
        q1 = _AVX_SUB(q1, w1);
        q2 = _AVX_SUB(q2, w2);
        q3 = _AVX_SUB(q3, w3);
        _AVX_STORE(&q[0],q1);
        _AVX_STORE(&q[offset],q2);
        _AVX_STORE(&q[2*offset],q3);
620

Andreas Marek's avatar
Andreas Marek committed
621 622 623 624
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
        q1 = _AVX_LOAD(&q[ldq]);
        q2 = _AVX_LOAD(&q[ldq+offset]);
        q3 = _AVX_LOAD(&q[ldq+2*offset]);
625
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
626 627 628
        q1 = _AVX_SUB(q1, _AVX_FMA(w1, h4, z1));
        q2 = _AVX_SUB(q2, _AVX_FMA(w2, h4, z2));
        q3 = _AVX_SUB(q3, _AVX_FMA(w3, h4, z3));
629
#else
Andreas Marek's avatar
Andreas Marek committed
630 631 632
        q1 = _AVX_SUB(q1, _AVX_ADD(z1, _AVX_MUL(w1, h4)));
        q2 = _AVX_SUB(q2, _AVX_ADD(z2, _AVX_MUL(w2, h4)));
        q3 = _AVX_SUB(q3, _AVX_ADD(z3, _AVX_MUL(w3, h4)));
633
#endif
Andreas Marek's avatar
Andreas Marek committed
634 635 636
        _AVX_STORE(&q[ldq],q1);
        _AVX_STORE(&q[ldq+offset],q2);
        _AVX_STORE(&q[ldq+2*offset],q3);
637

Andreas Marek's avatar
Andreas Marek committed
638 639 640 641 642 643 644
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
        q1 = _AVX_LOAD(&q[ldq*2]);
        q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
        q3 = _AVX_LOAD(&q[(ldq*2)+2*offset]);
        q1 = _AVX_SUB(q1, y1);
        q2 = _AVX_SUB(q2, y2);
        q3 = _AVX_SUB(q3, y3);
645
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
646 647 648
        q1 = _AVX_NFMA(w1, h4, q1);
        q2 = _AVX_NFMA(w2, h4, q2);
        q3 = _AVX_NFMA(w3, h4, q3);
649
#else
Andreas Marek's avatar
Andreas Marek committed
650 651 652
        q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
        q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
        q3 = _AVX_SUB(q3, _AVX_MUL(w3, h4));
653
#endif
Andreas Marek's avatar
Andreas Marek committed
654
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
655
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
656 657 658
        q1 = _AVX_NFMA(z1, h3, q1);
        q2 = _AVX_NFMA(z2, h3, q2);
        q3 = _AVX_NFMA(z3, h3, q3);
659
#else
Andreas Marek's avatar
Andreas Marek committed
660 661 662
        q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
        q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
        q3 = _AVX_SUB(q3, _AVX_MUL(z3, h3));
663
#endif
Andreas Marek's avatar
Andreas Marek committed
664 665 666
        _AVX_STORE(&q[ldq*2],q1);
        _AVX_STORE(&q[(ldq*2)+offset],q2);
        _AVX_STORE(&q[(ldq*2)+2*offset],q3);
667

Andreas Marek's avatar
Andreas Marek committed
668 669 670 671 672 673 674
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
        q1 = _AVX_LOAD(&q[ldq*3]);
        q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
        q3 = _AVX_LOAD(&q[(ldq*3)+2*offset]);
        q1 = _AVX_SUB(q1, x1);
        q2 = _AVX_SUB(q2, x2);
        q3 = _AVX_SUB(q3, x3);
675
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
676 677 678
        q1 = _AVX_NFMA(w1, h4, q1);
        q2 = _AVX_NFMA(w2, h4, q2);
        q3 = _AVX_NFMA(w3, h4, q3);
679
#else
Andreas Marek's avatar
Andreas Marek committed
680 681 682
        q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
        q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
        q3 = _AVX_SUB(q3, _AVX_MUL(w3, h4));
683
#endif
Andreas Marek's avatar
Andreas Marek committed
684
        h2 = _AVX_BROADCAST(&hh[ldh+1]);
685
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
686 687 688
        q1 = _AVX_NFMA(y1, h2, q1);
        q2 = _AVX_NFMA(y2, h2, q2);
        q3 = _AVX_NFMA(y3, h2, q3);
689
#else
Andreas Marek's avatar
Andreas Marek committed
690 691 692
        q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
        q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
        q3 = _AVX_SUB(q3, _AVX_MUL(y3, h2));
693
#endif
Andreas Marek's avatar
Andreas Marek committed
694
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
695
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
696 697 698
        q1 = _AVX_NFMA(z1, h3, q1);
        q2 = _AVX_NFMA(z2, h3, q2);
        q3 = _AVX_NFMA(z3, h3, q3);
699
#else
Andreas Marek's avatar
Andreas Marek committed
700 701 702
        q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
        q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
        q3 = _AVX_SUB(q3, _AVX_MUL(z3, h3));
703
#endif
Andreas Marek's avatar
Andreas Marek committed
704 705 706
        _AVX_STORE(&q[ldq*3], q1);
        _AVX_STORE(&q[(ldq*3)+offset], q2);
        _AVX_STORE(&q[(ldq*3)+2*offset], q3);
707

Andreas Marek's avatar
Andreas Marek committed
708 709 710
        for (i = 4; i < nb; i++)
        {
                h1 = _AVX_BROADCAST(&hh[i-3]);
711

Andreas Marek's avatar
Andreas Marek committed
712 713 714
                q1 = _AVX_LOAD(&q[i*ldq]);
                q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
                q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
715 716

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
717 718 719
                q1 = _AVX_NFMA(x1, h1, q1);
                q2 = _AVX_NFMA(x2, h1, q2);
                q3 = _AVX_NFMA(x3, h1, q3);
720
#else
Andreas Marek's avatar
Andreas Marek committed
721 722 723
                q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
                q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
                q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
724 725
#endif

Andreas Marek's avatar
Andreas Marek committed
726
                h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
727
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
728 729 730
                q1 = _AVX_NFMA(y1, h2, q1);
                q2 = _AVX_NFMA(y2, h2, q2);
                q3 = _AVX_NFMA(y3, h2, q3);
731
#else
Andreas Marek's avatar
Andreas Marek committed
732 733 734
                q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
                q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
                q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
735 736
#endif

Andreas Marek's avatar
Andreas Marek committed
737
                h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
738
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
739 740 741
                q1 = _AVX_NFMA(z1, h3, q1);
                q2 = _AVX_NFMA(z2, h3, q2);
                q3 = _AVX_NFMA(z3, h3, q3);
742
#else
Andreas Marek's avatar
Andreas Marek committed
743 744 745
                q1 = _AVX_SUB(q1, _AVX_MUL(z1,h3));
                q2 = _AVX_SUB(q2, _AVX_MUL(z2,h3));
                q3 = _AVX_SUB(q3, _AVX_MUL(z3,h3));
746 747
#endif

Andreas Marek's avatar
Andreas Marek committed
748
                h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
749
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
750 751 752
                q1 = _AVX_NFMA(w1, h4, q1);
                q2 = _AVX_NFMA(w2, h4, q2);
                q3 = _AVX_NFMA(w3, h4, q3);
753
#else
Andreas Marek's avatar
Andreas Marek committed
754 755 756
                q1 = _AVX_SUB(q1, _AVX_MUL(w1,h4));
                q2 = _AVX_SUB(q2, _AVX_MUL(w2,h4));
                q3 = _AVX_SUB(q3, _AVX_MUL(w3,h4));
757 758
#endif

Andreas Marek's avatar
Andreas Marek committed
759 760 761 762
                _AVX_STORE(&q[i*ldq],q1);
                _AVX_STORE(&q[(i*ldq)+offset],q2);
                _AVX_STORE(&q[(i*ldq)+2*offset],q3);
        }
763

Andreas Marek's avatar
Andreas Marek committed
764 765 766 767
        h1 = _AVX_BROADCAST(&hh[nb-3]);
        q1 = _AVX_LOAD(&q[nb*ldq]);
        q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
        q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
768
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
769 770 771
        q1 = _AVX_NFMA(x1, h1, q1);
        q2 = _AVX_NFMA(x2, h1, q2);
        q3 = _AVX_NFMA(x3, h1, q3);
772
#else
Andreas Marek's avatar
Andreas Marek committed
773 774 775
        q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
        q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
        q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
776
#endif
Andreas Marek's avatar
Andreas Marek committed
777
        h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
778
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
779 780 781
        q1 = _AVX_NFMA(y1, h2, q1);
        q2 = _AVX_NFMA(y2, h2, q2);
        q3 = _AVX_NFMA(y3, h2, q3);
782
#else
Andreas Marek's avatar
Andreas Marek committed
783 784 785
        q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
        q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
        q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
786
#endif
Andreas Marek's avatar
Andreas Marek committed
787
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
788
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
789 790 791
        q1 = _AVX_NFMA(z1, h3, q1);
        q2 = _AVX_NFMA(z2, h3, q2);
        q3 = _AVX_NFMA(z3, h3, q3);
792
#else
Andreas Marek's avatar
Andreas Marek committed
793 794 795
        q1 = _AVX_SUB(q1, _AVX_MUL(z1,h3));
        q2 = _AVX_SUB(q2, _AVX_MUL(z2,h3));
        q3 = _AVX_SUB(q3, _AVX_MUL(z3,h3));
796
#endif
Andreas Marek's avatar
Andreas Marek committed
797 798 799
        _AVX_STORE(&q[nb*ldq],q1);
        _AVX_STORE(&q[(nb*ldq)+offset],q2);
        _AVX_STORE(&q[(nb*ldq)+2*offset],q3);
800

Andreas Marek's avatar
Andreas Marek committed
801 802 803 804
        h1 = _AVX_BROADCAST(&hh[nb-2]);
        q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
        q3 = _AVX_LOAD(&q[((nb+1)*ldq)+2*offset]);
805
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
806 807 808
        q1 = _AVX_NFMA(x1, h1, q1);
        q2 = _AVX_NFMA(x2, h1, q2);
        q3 = _AVX_NFMA(x3, h1, q3);
809
#else
Andreas Marek's avatar
Andreas Marek committed
810 811 812
        q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
        q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
        q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
813
#endif
Andreas Marek's avatar
Andreas Marek committed
814
        h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
815
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
816 817 818
        q1 = _AVX_NFMA(y1, h2, q1);
        q2 = _AVX_NFMA(y2, h2, q2);
        q3 = _AVX_NFMA(y3, h2, q3);
819
#else
Andreas Marek's avatar
Andreas Marek committed
820 821 822
        q1 = _AVX_SUB(q1, _AVX_MUL(y1,h2));
        q2 = _AVX_SUB(q2, _AVX_MUL(y2,h2));
        q3 = _AVX_SUB(q3, _AVX_MUL(y3,h2));
823
#endif
Andreas Marek's avatar
Andreas Marek committed
824 825 826
        _AVX_STORE(&q[(nb+1)*ldq],q1);
        _AVX_STORE(&q[((nb+1)*ldq)+offset],q2);
        _AVX_STORE(&q[((nb+1)*ldq)+2*offset],q3);
827

Andreas Marek's avatar
Andreas Marek committed
828 829 830 831
        h1 = _AVX_BROADCAST(&hh[nb-1]);
        q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
        q3 = _AVX_LOAD(&q[((nb+2)*ldq)+2*offset]);
832
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
833 834 835
        q1 = _AVX_NFMA(x1, h1, q1);
        q2 = _AVX_NFMA(x2, h1, q2);
        q3 = _AVX_NFMA(x3, h1, q3);
836
#else
Andreas Marek's avatar
Andreas Marek committed
837 838 839
        q1 = _AVX_SUB(q1, _AVX_MUL(x1,h1));
        q2 = _AVX_SUB(q2, _AVX_MUL(x2,h1));
        q3 = _AVX_SUB(q3, _AVX_MUL(x3,h1));
840
#endif
Andreas Marek's avatar
Andreas Marek committed
841 842 843
        _AVX_STORE(&q[(nb+2)*ldq],q1);
        _AVX_STORE(&q[((nb+2)*ldq)+offset],q2);
        _AVX_STORE(&q[((nb+2)*ldq)+2*offset],q3);
844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif

 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [4 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;

        __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*3]);
        __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*2]);
        __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq]);
        __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[0]);

        __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
        __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
        __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
        __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
        __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
        __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);

#ifdef __ELPA_USE_FMA__
        __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
        w1 = _AVX_FMA(a2_1, h_4_2, w1);
        w1 = _AVX_FMA(a1_1, h_4_1, w1);
        __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
        z1 = _AVX_FMA(a1_1, h_3_1, z1);
        __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
        __AVX_DATATYPE x1 = a1_1;
#else
        __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
        w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
        w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
        __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
        z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
        __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
        __AVX_DATATYPE x1 = a1_1;
#endif

        __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
        __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
        __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[ldq+offset]);
        __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[0+offset]);

906
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
907 908 909 910 911 912 913
        __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
        w2 = _AVX_FMA(a2_2, h_4_2, w2);
        w2 = _AVX_FMA(a1_2, h_4_1, w2);
        __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
        z2 = _AVX_FMA(a1_2, h_3_1, z2);
        __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
        __AVX_DATATYPE x2 = a1_2;
914
#else
Andreas Marek's avatar
Andreas Marek committed
915 916 917 918 919 920 921
        __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
        w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
        w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
        __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
        z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
        __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
        __AVX_DATATYPE x2 = a1_2;
922 923
#endif

Andreas Marek's avatar
Andreas Marek committed
924 925
        __AVX_DATATYPE q1;
        __AVX_DATATYPE q2;
926

Andreas Marek's avatar
Andreas Marek committed
927 928 929 930
        __AVX_DATATYPE h1;
        __AVX_DATATYPE h2;
        __AVX_DATATYPE h3;
        __AVX_DATATYPE h4;
931

Andreas Marek's avatar
Andreas Marek committed
932 933 934 935 936 937
        for(i = 4; i < nb; i++)
        {
                h1 = _AVX_BROADCAST(&hh[i-3]);
                h2 = _AVX_BROADCAST(&hh[ldh+i-2]);
                h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-1]);
                h4 = _AVX_BROADCAST(&hh[(ldh*3)+i]);
938

Andreas Marek's avatar
Andreas Marek committed
939
                q1 = _AVX_LOAD(&q[i*ldq]);
940
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
941 942 943 944
                x1 = _AVX_FMA(q1, h1, x1);
                y1 = _AVX_FMA(q1, h2, y1);
                z1 = _AVX_FMA(q1, h3, z1);
                w1 = _AVX_FMA(q1, h4, w1);
945
#else
Andreas Marek's avatar
Andreas Marek committed
946 947 948 949
                x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
                y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
                z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
                w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
950 951
#endif

Andreas Marek's avatar
Andreas Marek committed
952
                q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
953
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
954 955 956 957
                x2 = _AVX_FMA(q2, h1, x2);
                y2 = _AVX_FMA(q2, h2, y2);
                z2 = _AVX_FMA(q2, h3, z2);
                w2 = _AVX_FMA(q2, h4, w2);
958
#else
Andreas Marek's avatar
Andreas Marek committed
959 960 961 962
                x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
                y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
                z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
                w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
963
#endif
Andreas Marek's avatar
Andreas Marek committed
964
        }
965

Andreas Marek's avatar
Andreas Marek committed
966 967 968
        h1 = _AVX_BROADCAST(&hh[nb-3]);
        h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
969

Andreas Marek's avatar
Andreas Marek committed
970 971
        q1 = _AVX_LOAD(&q[nb*ldq]);
        q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
972 973

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
974 975 976 977 978 979
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
        z1 = _AVX_FMA(q1, h3, z1);
        z2 = _AVX_FMA(q2, h3, z2);
980
#else
Andreas Marek's avatar
Andreas Marek committed
981 982 983 984 985 986
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
        z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
        z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
987 988
#endif

Andreas Marek's avatar
Andreas Marek committed
989 990
        h1 = _AVX_BROADCAST(&hh[nb-2]);
        h2 = _AVX_BROADCAST(&hh[(ldh*1)+nb-1]);
991

Andreas Marek's avatar
Andreas Marek committed
992 993
        q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
994 995

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
996 997 998