real_avx-avx2_6hv_template.c 76.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>
66 67
#include <stdio.h>
#include <stdlib.h>
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_SUB _mm256_sub_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_BROADCAST _mm256_broadcast_sd

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif

#endif

#define _AVX_FMA _mm256_FMA_pd
#define _AVX_NFMA _mm256_NFMA_pd
#define _AVX_FMSUB _mm256_FMSUB_pd
#endif /* DOUBLE_PRECISION_REAL */

#ifdef SINGLE_PRECISION_REAL
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_SUB _mm256_sub_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_BROADCAST _mm256_broadcast_ss

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_msub(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
#endif

#endif

#define _AVX_FMA _mm256_FMA_ps
#define _AVX_NFMA _mm256_NFMA_ps
#define _AVX_FMSUB _mm256_FMSUB_ps
#endif /* SINGLE_PRECISION_REAL */

#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);

void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);

void hexa_hh_trafo_real_avx_avx2_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

#endif

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine hexa_hh_trafo_real_avx_avx2_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine hexa_hh_trafo_real_avx_avx2_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_float)      :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
192 193 194 195 196 197
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;
198

Andreas Marek's avatar
Andreas Marek committed
199
        worked_on = 0;
200

Andreas Marek's avatar
Andreas Marek committed
201 202
        // calculating scalar products to compute
        // 6 householder vectors simultaneously
203
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
204
        double scalarprods[15];
205 206
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
        float scalarprods[15];
#endif

        scalarprods[0] = hh[(ldh+1)];
        scalarprods[1] = hh[(ldh*2)+2];
        scalarprods[2] = hh[(ldh*2)+1];
        scalarprods[3] = hh[(ldh*3)+3];
        scalarprods[4] = hh[(ldh*3)+2];
        scalarprods[5] = hh[(ldh*3)+1];
        scalarprods[6] = hh[(ldh*4)+4];
        scalarprods[7] = hh[(ldh*4)+3];
        scalarprods[8] = hh[(ldh*4)+2];
        scalarprods[9] = hh[(ldh*4)+1];
        scalarprods[10] = hh[(ldh*5)+5];
        scalarprods[11] = hh[(ldh*5)+4];
        scalarprods[12] = hh[(ldh*5)+3];
        scalarprods[13] = hh[(ldh*5)+2];
        scalarprods[14] = hh[(ldh*5)+1];

        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
        scalarprods[0] += hh[1] * hh[(2+ldh)];
        scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

        // loop counter = 3
        scalarprods[0] += hh[2] * hh[(3+ldh)];
        scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

        scalarprods[1] += hh[1] * hh[3+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

        // loop counter = 4
        scalarprods[0] += hh[3] * hh[(4+ldh)];
        scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

        scalarprods[1] += hh[2] * hh[4+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

        scalarprods[3] += hh[1] * hh[4+(ldh*3)];
        scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

        // loop counter = 5
        scalarprods[0] += hh[4] * hh[(5+ldh)];
        scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

        scalarprods[1] += hh[3] * hh[5+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

        scalarprods[3] += hh[2] * hh[5+(ldh*3)];
        scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

        scalarprods[6] += hh[1] * hh[5+(ldh*4)];
        scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

        #pragma ivdep
        for (i = 6; i < nb; i++)
        {
                scalarprods[0] += hh[i-1] * hh[(i+ldh)];
                scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
                scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
                scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
                scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

                scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
                scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
                scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
                scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

                scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
                scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
                scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

                scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
                scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

                scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
        }

        // Production level kernel calls with padding
306
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
307 308 309 310 311
        for (i = 0; i < nq-4; i+=8)
        {
                hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 8;
        }
312 313
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
314 315 316 317 318 319 320 321 322 323
        for (i = 0; i < nq-8; i+=16)
        {
                hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 16;
        }
#endif
        if (nq == i)
        {
                return;
        }
324
#ifdef DOUBLE_PRECISION_REAL
325
        if (nq-i == 4)
Andreas Marek's avatar
Andreas Marek committed
326 327 328 329
        {
                hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 4;
        }
330 331
#endif
#ifdef SINGLE_PRECISION_REAL
332
        if (nq-i == 8)
Andreas Marek's avatar
Andreas Marek committed
333 334 335 336
        {
                hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 8;
        }
337
#endif
338
#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
339 340 341 342 343
        if (worked_on != nq)
        {
            printf("Error in real AVX/AVX2 BLOCK6 kernel \n");
            abort();
         }
344
#endif
345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [8 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;

        __AVX_DATATYPE a1_1 = _AVX_LOAD(&q[ldq*5]);
        __AVX_DATATYPE a2_1 = _AVX_LOAD(&q[ldq*4]);
        __AVX_DATATYPE a3_1 = _AVX_LOAD(&q[ldq*3]);
        __AVX_DATATYPE a4_1 = _AVX_LOAD(&q[ldq*2]);
        __AVX_DATATYPE a5_1 = _AVX_LOAD(&q[ldq]);
        __AVX_DATATYPE a6_1 = _AVX_LOAD(&q[0]);

        __AVX_DATATYPE h_6_5 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
        __AVX_DATATYPE h_6_4 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
        __AVX_DATATYPE h_6_3 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
        __AVX_DATATYPE h_6_2 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
        __AVX_DATATYPE h_6_1 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE t1 = _AVX_FMA(a5_1, h_6_5, a6_1);
        t1 = _AVX_FMA(a4_1, h_6_4, t1);
        t1 = _AVX_FMA(a3_1, h_6_3, t1);
        t1 = _AVX_FMA(a2_1, h_6_2, t1);
        t1 = _AVX_FMA(a1_1, h_6_1, t1);
#else
        register __AVX_DATATYPE t1 = _AVX_ADD(a6_1, _AVX_MUL(a5_1, h_6_5));
        t1 = _AVX_ADD(t1, _AVX_MUL(a4_1, h_6_4));
        t1 = _AVX_ADD(t1, _AVX_MUL(a3_1, h_6_3));
        t1 = _AVX_ADD(t1, _AVX_MUL(a2_1, h_6_2));
        t1 = _AVX_ADD(t1, _AVX_MUL(a1_1, h_6_1));
#endif
        __AVX_DATATYPE h_5_4 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
        __AVX_DATATYPE h_5_3 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
        __AVX_DATATYPE h_5_2 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
        __AVX_DATATYPE h_5_1 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE v1 = _AVX_FMA(a4_1, h_5_4, a5_1);
        v1 = _AVX_FMA(a3_1, h_5_3, v1);
        v1 = _AVX_FMA(a2_1, h_5_2, v1);
        v1 = _AVX_FMA(a1_1, h_5_1, v1);
#else
        register __AVX_DATATYPE v1 = _AVX_ADD(a5_1, _AVX_MUL(a4_1, h_5_4));
        v1 = _AVX_ADD(v1, _AVX_MUL(a3_1, h_5_3));
        v1 = _AVX_ADD(v1, _AVX_MUL(a2_1, h_5_2));
        v1 = _AVX_ADD(v1, _AVX_MUL(a1_1, h_5_1));
#endif
        __AVX_DATATYPE h_4_3 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
        __AVX_DATATYPE h_4_2 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
        __AVX_DATATYPE h_4_1 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE w1 = _AVX_FMA(a3_1, h_4_3, a4_1);
        w1 = _AVX_FMA(a2_1, h_4_2, w1);
        w1 = _AVX_FMA(a1_1, h_4_1, w1);
#else
        register __AVX_DATATYPE w1 = _AVX_ADD(a4_1, _AVX_MUL(a3_1, h_4_3));
        w1 = _AVX_ADD(w1, _AVX_MUL(a2_1, h_4_2));
        w1 = _AVX_ADD(w1, _AVX_MUL(a1_1, h_4_1));
#endif
        __AVX_DATATYPE h_2_1 = _AVX_BROADCAST(&hh[ldh+1]);
        __AVX_DATATYPE h_3_2 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
        __AVX_DATATYPE h_3_1 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE z1 = _AVX_FMA(a2_1, h_3_2, a3_1);
        z1 = _AVX_FMA(a1_1, h_3_1, z1);
        register __AVX_DATATYPE y1 = _AVX_FMA(a1_1, h_2_1, a2_1);
#else
        register __AVX_DATATYPE z1 = _AVX_ADD(a3_1, _AVX_MUL(a2_1, h_3_2));
        z1 = _AVX_ADD(z1, _AVX_MUL(a1_1, h_3_1));
        register __AVX_DATATYPE y1 = _AVX_ADD(a2_1, _AVX_MUL(a1_1, h_2_1));
#endif
        register __AVX_DATATYPE x1 = a1_1;

        __AVX_DATATYPE a1_2 = _AVX_LOAD(&q[(ldq*5)+offset]);
        __AVX_DATATYPE a2_2 = _AVX_LOAD(&q[(ldq*4)+offset]);
        __AVX_DATATYPE a3_2 = _AVX_LOAD(&q[(ldq*3)+offset]);
        __AVX_DATATYPE a4_2 = _AVX_LOAD(&q[(ldq*2)+offset]);
        __AVX_DATATYPE a5_2 = _AVX_LOAD(&q[(ldq)+offset]);
        __AVX_DATATYPE a6_2 = _AVX_LOAD(&q[offset]);

#ifdef __ELPA_USE_FMA__
        register __AVX_DATATYPE t2 = _AVX_FMA(a5_2, h_6_5, a6_2);
        t2 = _AVX_FMA(a4_2, h_6_4, t2);
        t2 = _AVX_FMA(a3_2, h_6_3, t2);
        t2 = _AVX_FMA(a2_2, h_6_2, t2);
        t2 = _AVX_FMA(a1_2, h_6_1, t2);
        register __AVX_DATATYPE v2 = _AVX_FMA(a4_2, h_5_4, a5_2);
        v2 = _AVX_FMA(a3_2, h_5_3, v2);
        v2 = _AVX_FMA(a2_2, h_5_2, v2);
        v2 = _AVX_FMA(a1_2, h_5_1, v2);
        register __AVX_DATATYPE w2 = _AVX_FMA(a3_2, h_4_3, a4_2);
        w2 = _AVX_FMA(a2_2, h_4_2, w2);
        w2 = _AVX_FMA(a1_2, h_4_1, w2);
        register __AVX_DATATYPE z2 = _AVX_FMA(a2_2, h_3_2, a3_2);
        z2 = _AVX_FMA(a1_2, h_3_1, z2);
        register __AVX_DATATYPE y2 = _AVX_FMA(a1_2, h_2_1, a2_2);
#else
        register __AVX_DATATYPE t2 = _AVX_ADD(a6_2, _AVX_MUL(a5_2, h_6_5));
        t2 = _AVX_ADD(t2, _AVX_MUL(a4_2, h_6_4));
        t2 = _AVX_ADD(t2, _AVX_MUL(a3_2, h_6_3));
        t2 = _AVX_ADD(t2, _AVX_MUL(a2_2, h_6_2));
        t2 = _AVX_ADD(t2, _AVX_MUL(a1_2, h_6_1));
        register __AVX_DATATYPE v2 = _AVX_ADD(a5_2, _AVX_MUL(a4_2, h_5_4));
        v2 = _AVX_ADD(v2, _AVX_MUL(a3_2, h_5_3));
        v2 = _AVX_ADD(v2, _AVX_MUL(a2_2, h_5_2));
        v2 = _AVX_ADD(v2, _AVX_MUL(a1_2, h_5_1));
        register __AVX_DATATYPE w2 = _AVX_ADD(a4_2, _AVX_MUL(a3_2, h_4_3));
        w2 = _AVX_ADD(w2, _AVX_MUL(a2_2, h_4_2));
        w2 = _AVX_ADD(w2, _AVX_MUL(a1_2, h_4_1));
        register __AVX_DATATYPE z2 = _AVX_ADD(a3_2, _AVX_MUL(a2_2, h_3_2));
        z2 = _AVX_ADD(z2, _AVX_MUL(a1_2, h_3_1));
        register __AVX_DATATYPE y2 = _AVX_ADD(a2_2, _AVX_MUL(a1_2, h_2_1));
476
#endif
Andreas Marek's avatar
Andreas Marek committed
477
        register __AVX_DATATYPE x2 = a1_2;
478

Andreas Marek's avatar
Andreas Marek committed
479 480
        __AVX_DATATYPE q1;
        __AVX_DATATYPE q2;
481

Andreas Marek's avatar
Andreas Marek committed
482 483 484 485 486 487
        __AVX_DATATYPE h1;
        __AVX_DATATYPE h2;
        __AVX_DATATYPE h3;
        __AVX_DATATYPE h4;
        __AVX_DATATYPE h5;
        __AVX_DATATYPE h6;
488

Andreas Marek's avatar
Andreas Marek committed
489 490 491 492 493
        for(i = 6; i < nb; i++)
        {
                h1 = _AVX_BROADCAST(&hh[i-5]);
                q1 = _AVX_LOAD(&q[i*ldq]);
                q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
494
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
495 496
                x1 = _AVX_FMA(q1, h1, x1);
                x2 = _AVX_FMA(q2, h1, x2);
497
#else
Andreas Marek's avatar
Andreas Marek committed
498 499
                x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
                x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
500
#endif
Andreas Marek's avatar
Andreas Marek committed
501
                h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
502
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
503 504
                y1 = _AVX_FMA(q1, h2, y1);
                y2 = _AVX_FMA(q2, h2, y2);
505
#else
Andreas Marek's avatar
Andreas Marek committed
506 507
                y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
                y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
508
#endif
Andreas Marek's avatar
Andreas Marek committed
509
                h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
510
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
511 512
                z1 = _AVX_FMA(q1, h3, z1);
                z2 = _AVX_FMA(q2, h3, z2);
513
#else
Andreas Marek's avatar
Andreas Marek committed
514 515
                z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
                z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
516
#endif
Andreas Marek's avatar
Andreas Marek committed
517
                h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
518
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
519 520
                w1 = _AVX_FMA(q1, h4, w1);
                w2 = _AVX_FMA(q2, h4, w2);
521
#else
Andreas Marek's avatar
Andreas Marek committed
522 523
                w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
                w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
524
#endif
Andreas Marek's avatar
Andreas Marek committed
525
                h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
526
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
527 528
                v1 = _AVX_FMA(q1, h5, v1);
                v2 = _AVX_FMA(q2, h5, v2);
529
#else
Andreas Marek's avatar
Andreas Marek committed
530 531
                v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
                v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
532
#endif
Andreas Marek's avatar
Andreas Marek committed
533
                h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
534
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
535 536
                t1 = _AVX_FMA(q1, h6, t1);
                t2 = _AVX_FMA(q2, h6, t2);
537
#else
Andreas Marek's avatar
Andreas Marek committed
538 539
                t1 = _AVX_ADD(t1, _AVX_MUL(q1,h6));
                t2 = _AVX_ADD(t2, _AVX_MUL(q2,h6));
540
#endif
Andreas Marek's avatar
Andreas Marek committed
541
        }
542

Andreas Marek's avatar
Andreas Marek committed
543 544 545
        h1 = _AVX_BROADCAST(&hh[nb-5]);
        q1 = _AVX_LOAD(&q[nb*ldq]);
        q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
546
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
547 548
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
549
#else
Andreas Marek's avatar
Andreas Marek committed
550 551
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
552
#endif
Andreas Marek's avatar
Andreas Marek committed
553
        h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
554
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
555 556
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
557
#else
Andreas Marek's avatar
Andreas Marek committed
558 559
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
560
#endif
Andreas Marek's avatar
Andreas Marek committed
561
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-3]);
562
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
563 564
        z1 = _AVX_FMA(q1, h3, z1);
        z2 = _AVX_FMA(q2, h3, z2);
565
#else
Andreas Marek's avatar
Andreas Marek committed
566 567
        z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
        z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
568
#endif
Andreas Marek's avatar
Andreas Marek committed
569
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-2]);
570
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
571 572
        w1 = _AVX_FMA(q1, h4, w1);
        w2 = _AVX_FMA(q2, h4, w2);
573
#else
Andreas Marek's avatar
Andreas Marek committed
574 575
        w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
        w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
576
#endif
Andreas Marek's avatar
Andreas Marek committed
577
        h5 = _AVX_BROADCAST(&hh[(ldh*4)+nb-1]);
578
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
579 580
        v1 = _AVX_FMA(q1, h5, v1);
        v2 = _AVX_FMA(q2, h5, v2);
581
#else
Andreas Marek's avatar
Andreas Marek committed
582 583
        v1 = _AVX_ADD(v1, _AVX_MUL(q1,h5));
        v2 = _AVX_ADD(v2, _AVX_MUL(q2,h5));
584 585
#endif

Andreas Marek's avatar
Andreas Marek committed
586 587 588
        h1 = _AVX_BROADCAST(&hh[nb-4]);
        q1 = _AVX_LOAD(&q[(nb+1)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+1)*ldq)+offset]);
589
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
590 591
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
592
#else
Andreas Marek's avatar
Andreas Marek committed
593 594
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
595
#endif
Andreas Marek's avatar
Andreas Marek committed
596
        h2 = _AVX_BROADCAST(&hh[ldh+nb-3]);
597
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
598 599
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
600
#else
Andreas Marek's avatar
Andreas Marek committed
601 602
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
603
#endif
Andreas Marek's avatar
Andreas Marek committed
604
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-2]);
605
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
606 607
        z1 = _AVX_FMA(q1, h3, z1);
        z2 = _AVX_FMA(q2, h3, z2);
608
#else
Andreas Marek's avatar
Andreas Marek committed
609 610
        z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
        z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
611
#endif
Andreas Marek's avatar
Andreas Marek committed
612
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+nb-1]);
613
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
614 615
        w1 = _AVX_FMA(q1, h4, w1);
        w2 = _AVX_FMA(q2, h4, w2);
616
#else
Andreas Marek's avatar
Andreas Marek committed
617 618
        w1 = _AVX_ADD(w1, _AVX_MUL(q1,h4));
        w2 = _AVX_ADD(w2, _AVX_MUL(q2,h4));
619 620
#endif

Andreas Marek's avatar
Andreas Marek committed
621 622 623
        h1 = _AVX_BROADCAST(&hh[nb-3]);
        q1 = _AVX_LOAD(&q[(nb+2)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+2)*ldq)+offset]);
624
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
625 626
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
627
#else
Andreas Marek's avatar
Andreas Marek committed
628 629
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
630
#endif
Andreas Marek's avatar
Andreas Marek committed
631
        h2 = _AVX_BROADCAST(&hh[ldh+nb-2]);
632
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
633 634
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
635
#else
Andreas Marek's avatar
Andreas Marek committed
636 637
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
638
#endif
Andreas Marek's avatar
Andreas Marek committed
639
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+nb-1]);
640
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
641 642
        z1 = _AVX_FMA(q1, h3, z1);
        z2 = _AVX_FMA(q2, h3, z2);
643
#else
Andreas Marek's avatar
Andreas Marek committed
644 645
        z1 = _AVX_ADD(z1, _AVX_MUL(q1,h3));
        z2 = _AVX_ADD(z2, _AVX_MUL(q2,h3));
646 647
#endif

Andreas Marek's avatar
Andreas Marek committed
648 649 650
        h1 = _AVX_BROADCAST(&hh[nb-2]);
        q1 = _AVX_LOAD(&q[(nb+3)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+3)*ldq)+offset]);
651
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
652 653
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
654
#else
Andreas Marek's avatar
Andreas Marek committed
655 656
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
657
#endif
Andreas Marek's avatar
Andreas Marek committed
658
        h2 = _AVX_BROADCAST(&hh[ldh+nb-1]);
659
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
660 661
        y1 = _AVX_FMA(q1, h2, y1);
        y2 = _AVX_FMA(q2, h2, y2);
662
#else
Andreas Marek's avatar
Andreas Marek committed
663 664
        y1 = _AVX_ADD(y1, _AVX_MUL(q1,h2));
        y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
665 666
#endif

Andreas Marek's avatar
Andreas Marek committed
667 668 669
        h1 = _AVX_BROADCAST(&hh[nb-1]);
        q1 = _AVX_LOAD(&q[(nb+4)*ldq]);
        q2 = _AVX_LOAD(&q[((nb+4)*ldq)+offset]);
670
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
671 672
        x1 = _AVX_FMA(q1, h1, x1);
        x2 = _AVX_FMA(q2, h1, x2);
673
#else
Andreas Marek's avatar
Andreas Marek committed
674 675
        x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
        x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
676 677
#endif

Andreas Marek's avatar
Andreas Marek committed
678 679 680
        /////////////////////////////////////////////////////
        // Apply tau, correct wrong calculation using pre-calculated scalar products
        /////////////////////////////////////////////////////
681

Andreas Marek's avatar
Andreas Marek committed
682 683 684
        __AVX_DATATYPE tau1 = _AVX_BROADCAST(&hh[0]);
        x1 = _AVX_MUL(x1, tau1);
        x2 = _AVX_MUL(x2, tau1);
685

Andreas Marek's avatar
Andreas Marek committed
686 687 688
        __AVX_DATATYPE tau2 = _AVX_BROADCAST(&hh[ldh]);
        __AVX_DATATYPE vs_1_2 = _AVX_BROADCAST(&scalarprods[0]);
        h2 = _AVX_MUL(tau2, vs_1_2);
689
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
690 691
        y1 = _AVX_FMSUB(y1, tau2, _AVX_MUL(x1,h2));
        y2 = _AVX_FMSUB(y2, tau2, _AVX_MUL(x2,h2));
692
#else
Andreas Marek's avatar
Andreas Marek committed
693 694
        y1 = _AVX_SUB(_AVX_MUL(y1,tau2), _AVX_MUL(x1,h2));
        y2 = _AVX_SUB(_AVX_MUL(y2,tau2), _AVX_MUL(x2,h2));
695 696
#endif

Andreas Marek's avatar
Andreas Marek committed
697 698 699 700 701
        __AVX_DATATYPE tau3 = _AVX_BROADCAST(&hh[ldh*2]);
        __AVX_DATATYPE vs_1_3 = _AVX_BROADCAST(&scalarprods[1]);
        __AVX_DATATYPE vs_2_3 = _AVX_BROADCAST(&scalarprods[2]);
        h2 = _AVX_MUL(tau3, vs_1_3);
        h3 = _AVX_MUL(tau3, vs_2_3);
702
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
703 704
        z1 = _AVX_FMSUB(z1, tau3, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)));
        z2 = _AVX_FMSUB(z2, tau3, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)));
705
#else
Andreas Marek's avatar
Andreas Marek committed
706 707
        z1 = _AVX_SUB(_AVX_MUL(z1,tau3), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)));
        z2 = _AVX_SUB(_AVX_MUL(z2,tau3), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)));
708 709
#endif

Andreas Marek's avatar
Andreas Marek committed
710 711 712 713 714 715 716
        __AVX_DATATYPE tau4 = _AVX_BROADCAST(&hh[ldh*3]);
        __AVX_DATATYPE vs_1_4 = _AVX_BROADCAST(&scalarprods[3]);
        __AVX_DATATYPE vs_2_4 = _AVX_BROADCAST(&scalarprods[4]);
        h2 = _AVX_MUL(tau4, vs_1_4);
        h3 = _AVX_MUL(tau4, vs_2_4);
        __AVX_DATATYPE vs_3_4 = _AVX_BROADCAST(&scalarprods[5]);
        h4 = _AVX_MUL(tau4, vs_3_4);
717
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
718 719
        w1 = _AVX_FMSUB(w1, tau4, _AVX_FMA(z1, h4, _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
        w2 = _AVX_FMSUB(w2, tau4, _AVX_FMA(z2, h4, _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
720
#else
Andreas Marek's avatar
Andreas Marek committed
721 722
        w1 = _AVX_SUB(_AVX_MUL(w1,tau4), _AVX_ADD(_AVX_MUL(z1,h4), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
        w2 = _AVX_SUB(_AVX_MUL(w2,tau4), _AVX_ADD(_AVX_MUL(z2,h4), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
723 724
#endif

Andreas Marek's avatar
Andreas Marek committed
725 726 727 728 729 730 731 732 733
        __AVX_DATATYPE tau5 = _AVX_BROADCAST(&hh[ldh*4]);
        __AVX_DATATYPE vs_1_5 = _AVX_BROADCAST(&scalarprods[6]);
        __AVX_DATATYPE vs_2_5 = _AVX_BROADCAST(&scalarprods[7]);
        h2 = _AVX_MUL(tau5, vs_1_5);
        h3 = _AVX_MUL(tau5, vs_2_5);
        __AVX_DATATYPE vs_3_5 = _AVX_BROADCAST(&scalarprods[8]);
        __AVX_DATATYPE vs_4_5 = _AVX_BROADCAST(&scalarprods[9]);
        h4 = _AVX_MUL(tau5, vs_3_5);
        h5 = _AVX_MUL(tau5, vs_4_5);
734
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
735 736
        v1 = _AVX_FMSUB(v1, tau5, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2))));
        v2 = _AVX_FMSUB(v2, tau5, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2))));
737
#else
Andreas Marek's avatar
Andreas Marek committed
738 739
        v1 = _AVX_SUB(_AVX_MUL(v1,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2))));
        v2 = _AVX_SUB(_AVX_MUL(v2,tau5), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2))));
740 741
#endif

Andreas Marek's avatar
Andreas Marek committed
742 743 744 745 746 747 748 749 750 751 752
        __AVX_DATATYPE tau6 = _AVX_BROADCAST(&hh[ldh*5]);
        __AVX_DATATYPE vs_1_6 = _AVX_BROADCAST(&scalarprods[10]);
        __AVX_DATATYPE vs_2_6 = _AVX_BROADCAST(&scalarprods[11]);
        h2 = _AVX_MUL(tau6, vs_1_6);
        h3 = _AVX_MUL(tau6, vs_2_6);
        __AVX_DATATYPE vs_3_6 = _AVX_BROADCAST(&scalarprods[12]);
        __AVX_DATATYPE vs_4_6 = _AVX_BROADCAST(&scalarprods[13]);
        __AVX_DATATYPE vs_5_6 = _AVX_BROADCAST(&scalarprods[14]);
        h4 = _AVX_MUL(tau6, vs_3_6);
        h5 = _AVX_MUL(tau6, vs_4_6);
        h6 = _AVX_MUL(tau6, vs_5_6);
753
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
754 755
        t1 = _AVX_FMSUB(t1, tau6, _AVX_FMA(v1, h6, _AVX_ADD(_AVX_FMA(w1, h5, _AVX_MUL(z1,h4)), _AVX_FMA(y1, h3, _AVX_MUL(x1,h2)))));
        t2 = _AVX_FMSUB(t2, tau6, _AVX_FMA(v2, h6, _AVX_ADD(_AVX_FMA(w2, h5, _AVX_MUL(z2,h4)), _AVX_FMA(y2, h3, _AVX_MUL(x2,h2)))));
756
#else
Andreas Marek's avatar
Andreas Marek committed
757 758
        t1 = _AVX_SUB(_AVX_MUL(t1,tau6), _AVX_ADD( _AVX_MUL(v1,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w1,h5), _AVX_MUL(z1,h4)), _AVX_ADD(_AVX_MUL(y1,h3), _AVX_MUL(x1,h2)))));
        t2 = _AVX_SUB(_AVX_MUL(t2,tau6), _AVX_ADD( _AVX_MUL(v2,h6), _AVX_ADD(_AVX_ADD(_AVX_MUL(w2,h5), _AVX_MUL(z2,h4)), _AVX_ADD(_AVX_MUL(y2,h3), _AVX_MUL(x2,h2)))));
759 760
#endif

Andreas Marek's avatar
Andreas Marek committed
761 762 763
        /////////////////////////////////////////////////////
        // Rank-1 update of Q [8 x nb+3]
        /////////////////////////////////////////////////////
764

Andreas Marek's avatar
Andreas Marek committed
765 766 767 768 769 770
        q1 = _AVX_LOAD(&q[0]);
        q2 = _AVX_LOAD(&q[offset]);
        q1 = _AVX_SUB(q1, t1);
        q2 = _AVX_SUB(q2, t2);
        _AVX_STORE(&q[0],q1);
        _AVX_STORE(&q[offset],q2);
771

Andreas Marek's avatar
Andreas Marek committed
772 773 774 775 776
        h6 = _AVX_BROADCAST(&hh[(ldh*5)+1]);
        q1 = _AVX_LOAD(&q[ldq]);
        q2 = _AVX_LOAD(&q[(ldq+offset)]);
        q1 = _AVX_SUB(q1, v1);
        q2 = _AVX_SUB(q2, v2);
777
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
778 779
        q1 = _AVX_NFMA(t1, h6, q1);
        q2 = _AVX_NFMA(t2, h6, q2);
780
#else
Andreas Marek's avatar
Andreas Marek committed
781 782
        q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
        q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
783
#endif
Andreas Marek's avatar
Andreas Marek committed
784 785
        _AVX_STORE(&q[ldq],q1);
        _AVX_STORE(&q[(ldq+offset)],q2);
786

Andreas Marek's avatar
Andreas Marek committed
787 788 789 790 791
        h5 = _AVX_BROADCAST(&hh[(ldh*4)+1]);
        q1 = _AVX_LOAD(&q[ldq*2]);
        q2 = _AVX_LOAD(&q[(ldq*2)+offset]);
        q1 = _AVX_SUB(q1, w1);
        q2 = _AVX_SUB(q2, w2);
792
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
793 794
        q1 = _AVX_NFMA(v1, h5, q1);
        q2 = _AVX_NFMA(v2, h5, q2);
795
#else
Andreas Marek's avatar
Andreas Marek committed
796 797
        q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
        q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
798
#endif
Andreas Marek's avatar
Andreas Marek committed
799
        h6 = _AVX_BROADCAST(&hh[(ldh*5)+2]);
800
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
801 802
        q1 = _AVX_NFMA(t1, h6, q1);
        q2 = _AVX_NFMA(t2, h6, q2);
803
#else
Andreas Marek's avatar
Andreas Marek committed
804 805
        q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
        q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
806
#endif
Andreas Marek's avatar
Andreas Marek committed
807 808
        _AVX_STORE(&q[ldq*2],q1);
        _AVX_STORE(&q[(ldq*2)+offset],q2);
809

Andreas Marek's avatar
Andreas Marek committed
810 811 812 813 814
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+1]);
        q1 = _AVX_LOAD(&q[ldq*3]);
        q2 = _AVX_LOAD(&q[(ldq*3)+offset]);
        q1 = _AVX_SUB(q1, z1);
        q2 = _AVX_SUB(q2, z2);
815
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
816 817
        q1 = _AVX_NFMA(w1, h4, q1);
        q2 = _AVX_NFMA(w2, h4, q2);
818
#else
Andreas Marek's avatar
Andreas Marek committed
819 820
        q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
        q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
821
#endif
Andreas Marek's avatar
Andreas Marek committed
822
        h5 = _AVX_BROADCAST(&hh[(ldh*4)+2]);
823
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
824 825
        q1 = _AVX_NFMA(v1, h5, q1);
        q2 = _AVX_NFMA(v2, h5, q2);
826
#else
Andreas Marek's avatar
Andreas Marek committed
827 828
        q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
        q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
829
#endif
Andreas Marek's avatar
Andreas Marek committed
830
        h6 = _AVX_BROADCAST(&hh[(ldh*5)+3]);
831
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
832 833
        q1 = _AVX_NFMA(t1, h6, q1);
        q2 = _AVX_NFMA(t2, h6, q2);
834
#else
Andreas Marek's avatar
Andreas Marek committed
835 836
        q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
        q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
837
#endif
Andreas Marek's avatar
Andreas Marek committed
838 839
        _AVX_STORE(&q[ldq*3],q1);
        _AVX_STORE(&q[(ldq*3)+offset],q2);
840

Andreas Marek's avatar
Andreas Marek committed
841 842 843 844 845
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+1]);
        q1 = _AVX_LOAD(&q[ldq*4]);
        q2 = _AVX_LOAD(&q[(ldq*4)+offset]);
        q1 = _AVX_SUB(q1, y1);
        q2 = _AVX_SUB(q2, y2);
846
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
847 848
        q1 = _AVX_NFMA(z1, h3, q1);
        q2 = _AVX_NFMA(z2, h3, q2);
849
#else
Andreas Marek's avatar
Andreas Marek committed
850 851
        q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
        q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
852
#endif
Andreas Marek's avatar
Andreas Marek committed
853
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+2]);
854
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
855 856
        q1 = _AVX_NFMA(w1, h4, q1);
        q2 = _AVX_NFMA(w2, h4, q2);
857
#else
Andreas Marek's avatar
Andreas Marek committed
858 859
        q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
        q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
860
#endif
Andreas Marek's avatar
Andreas Marek committed
861
        h5 = _AVX_BROADCAST(&hh[(ldh*4)+3]);
862
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
863 864
        q1 = _AVX_NFMA(v1, h5, q1);
        q2 = _AVX_NFMA(v2, h5, q2);
865
#else
Andreas Marek's avatar
Andreas Marek committed
866 867
        q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
        q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
868
#endif
Andreas Marek's avatar
Andreas Marek committed
869
        h6 = _AVX_BROADCAST(&hh[(ldh*5)+4]);
870
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
871 872
        q1 = _AVX_NFMA(t1, h6, q1);
        q2 = _AVX_NFMA(t2, h6, q2);
873
#else
Andreas Marek's avatar
Andreas Marek committed
874 875
        q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
        q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
876
#endif
Andreas Marek's avatar
Andreas Marek committed
877 878
        _AVX_STORE(&q[ldq*4],q1);
        _AVX_STORE(&q[(ldq*4)+offset],q2);
879

Andreas Marek's avatar
Andreas Marek committed
880 881 882 883 884
        h2 = _AVX_BROADCAST(&hh[(ldh)+1]);
        q1 = _AVX_LOAD(&q[ldq*5]);
        q2 = _AVX_LOAD(&q[(ldq*5)+offset]);
        q1 = _AVX_SUB(q1, x1);
        q2 = _AVX_SUB(q2, x2);
885
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
886 887
        q1 = _AVX_NFMA(y1, h2, q1);
        q2 = _AVX_NFMA(y2, h2, q2);
888
#else
Andreas Marek's avatar
Andreas Marek committed
889 890
        q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
        q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
891
#endif
Andreas Marek's avatar
Andreas Marek committed
892
        h3 = _AVX_BROADCAST(&hh[(ldh*2)+2]);
893
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
894 895
        q1 = _AVX_NFMA(z1, h3, q1);
        q2 = _AVX_NFMA(z2, h3, q2);
896
#else
Andreas Marek's avatar
Andreas Marek committed
897 898
        q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
        q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
899
#endif
Andreas Marek's avatar
Andreas Marek committed
900
        h4 = _AVX_BROADCAST(&hh[(ldh*3)+3]);
901
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
902 903
        q1 = _AVX_NFMA(w1, h4, q1);
        q2 = _AVX_NFMA(w2, h4, q2);
904
#else
Andreas Marek's avatar
Andreas Marek committed
905 906
        q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
        q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
907
#endif
Andreas Marek's avatar
Andreas Marek committed
908
        h5 = _AVX_BROADCAST(&hh[(ldh*4)+4]);
909
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
910 911
        q1 = _AVX_NFMA(v1, h5, q1);
        q2 = _AVX_NFMA(v2, h5, q2);
912
#else
Andreas Marek's avatar
Andreas Marek committed
913 914
        q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
        q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
915
#endif
Andreas Marek's avatar
Andreas Marek committed
916
        h6 = _AVX_BROADCAST(&hh[(ldh*5)+5]);
917
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
918 919
        q1 = _AVX_NFMA(t1, h6, q1);
        q2 = _AVX_NFMA(t2, h6, q2);
920
#else
Andreas Marek's avatar
Andreas Marek committed
921 922
        q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
        q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
923
#endif
Andreas Marek's avatar
Andreas Marek committed
924 925
        _AVX_STORE(&q[ldq*5],q1);
        _AVX_STORE(&q[(ldq*5)+offset],q2);
926

Andreas Marek's avatar
Andreas Marek committed
927 928 929 930 931
        for (i = 6; i < nb; i++)
        {
                q1 = _AVX_LOAD(&q[i*ldq]);
                q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
                h1 = _AVX_BROADCAST(&hh[i-5]);
932
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
933 934
                q1 = _AVX_NFMA(x1, h1, q1);
                q2 = _AVX_NFMA(x2, h1, q2);
935
#else
Andreas Marek's avatar
Andreas Marek committed
936 937
                q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
                q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
938
#endif
Andreas Marek's avatar
Andreas Marek committed
939
                h2 = _AVX_BROADCAST(&hh[ldh+i-4]);
940
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
941 942
                q1 = _AVX_NFMA(y1, h2, q1);
                q2 = _AVX_NFMA(y2, h2, q2);
943
#else
Andreas Marek's avatar
Andreas Marek committed
944 945
                q1 = _AVX_SUB(q1, _AVX_MUL(y1, h2));
                q2 = _AVX_SUB(q2, _AVX_MUL(y2, h2));
946
#endif
Andreas Marek's avatar
Andreas Marek committed
947
                h3 = _AVX_BROADCAST(&hh[(ldh*2)+i-3]);
948
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
949 950
                q1 = _AVX_NFMA(z1, h3, q1);
                q2 = _AVX_NFMA(z2, h3, q2);
951
#else
Andreas Marek's avatar
Andreas Marek committed
952 953
                q1 = _AVX_SUB(q1, _AVX_MUL(z1, h3));
                q2 = _AVX_SUB(q2, _AVX_MUL(z2, h3));
954
#endif
Andreas Marek's avatar
Andreas Marek committed
955
                h4 = _AVX_BROADCAST(&hh[(ldh*3)+i-2]);
956
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
957 958
                q1 = _AVX_NFMA(w1, h4, q1);
                q2 = _AVX_NFMA(w2, h4, q2);
959
#else
Andreas Marek's avatar
Andreas Marek committed
960 961
                q1 = _AVX_SUB(q1, _AVX_MUL(w1, h4));
                q2 = _AVX_SUB(q2, _AVX_MUL(w2, h4));
962
#endif
Andreas Marek's avatar
Andreas Marek committed
963
                h5 = _AVX_BROADCAST(&hh[(ldh*4)+i-1]);
964
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
965 966
                q1 = _AVX_NFMA(v1, h5, q1);
                q2 = _AVX_NFMA(v2, h5, q2);
967
#else
Andreas Marek's avatar
Andreas Marek committed
968 969
                q1 = _AVX_SUB(q1, _AVX_MUL(v1, h5));
                q2 = _AVX_SUB(q2, _AVX_MUL(v2, h5));
970
#endif
Andreas Marek's avatar
Andreas Marek committed
971
                h6 = _AVX_BROADCAST(&hh[(ldh*5)+i]);
972
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
973 974
                q1 = _AVX_NFMA(t1, h6, q1);
                q2 = _AVX_NFMA(t2, h6, q2);
975
#else
Andreas Marek's avatar
Andreas Marek committed
976 977
                q1 = _AVX_SUB(q1, _AVX_MUL(t1, h6));
                q2 = _AVX_SUB(q2, _AVX_MUL(t2, h6));
978
#endif
Andreas Marek's avatar
Andreas Marek committed
979 980 981
                _AVX_STORE(&q[i*ldq],q1);
                _AVX_STORE(&q[(i*ldq)+offset],q2);
        }
982

Andreas Marek's avatar
Andreas Marek committed
983 984 985
        h1 = _AVX_BROADCAST(&hh[nb-5]);
        q1 = _AVX_LOAD(&q[nb*ldq]);
        q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
986
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
987 988
        q1 = _AVX_NFMA(x1, h1, q1);
        q2 = _AVX_NFMA(x2, h1, q2);
989
#else
Andreas Marek's avatar
Andreas Marek committed
990 991
        q1 = _AVX_SUB(q1, _AVX_MUL(x1, h1));
        q2 = _AVX_SUB(q2, _AVX_MUL(x2, h1));
992
#endif
Andreas Marek's avatar
Andreas Marek committed
993
        h2 = _AVX_BROADCAST(&hh[ldh+nb-4]);
994
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
995 996
        q1 = _AVX_NFMA(y1, h2, q1);
        q2 = _AVX_NFMA(y2, h2, q2);