real_avx512_6hv_template.c 98.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"
#include <x86intrin.h>
50
#include <stdio.h>
Andreas Marek's avatar
Andreas Marek committed
51
#include <stdlib.h>
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 8
#define __AVX512_DATATYPE __m512d
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SUB _mm512_sub_pd

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
#define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c)
#define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c)

#endif

#define _AVX512_FMA _mm512_FMA_pd
#define _AVX512_NFMA _mm512_NFMA_pd
#define _AVX512_FMSUB _mm512_FMSUB_pd
#endif /* DOUBLE_PRECISION_REAL */

#ifdef SINGLE_PRECISION_REAL
#define offset 16
#define __AVX512_DATATYPE __m512
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SUB _mm512_sub_ps

#ifdef HAVE_AVX512

#define __ELPA_USE_FMA__
#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)
#endif

#define _AVX512_FMA _mm512_FMA_ps
#define _AVX512_NFMA _mm512_NFMA_ps
#define _AVX512_FMSUB _mm512_FMSUB_ps
#endif /* SINGLE_PRECISION_REAL */



//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);

void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
Andreas Marek's avatar
Andreas Marek committed
117 118
static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162

void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

#endif

/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine hexa_hh_trafo_real_avx512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="hexa_hh_trafo_real_avx512_6hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/


/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_float)      :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/


#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif


{
Andreas Marek's avatar
Andreas Marek committed
163 164 165 166 167 168
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;
Andreas Marek's avatar
Andreas Marek committed
169

Andreas Marek's avatar
Andreas Marek committed
170
        worked_on = 0;
171

Andreas Marek's avatar
Andreas Marek committed
172 173
        // calculating scalar products to compute
        // 6 householder vectors simultaneously
174
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
175
        double scalarprods[15];
176 177
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
178
        float scalarprods[15];
179 180
#endif

Andreas Marek's avatar
Andreas Marek committed
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
        scalarprods[0] = hh[(ldh+1)];
        scalarprods[1] = hh[(ldh*2)+2];
        scalarprods[2] = hh[(ldh*2)+1];
        scalarprods[3] = hh[(ldh*3)+3];
        scalarprods[4] = hh[(ldh*3)+2];
        scalarprods[5] = hh[(ldh*3)+1];
        scalarprods[6] = hh[(ldh*4)+4];
        scalarprods[7] = hh[(ldh*4)+3];
        scalarprods[8] = hh[(ldh*4)+2];
        scalarprods[9] = hh[(ldh*4)+1];
        scalarprods[10] = hh[(ldh*5)+5];
        scalarprods[11] = hh[(ldh*5)+4];
        scalarprods[12] = hh[(ldh*5)+3];
        scalarprods[13] = hh[(ldh*5)+2];
        scalarprods[14] = hh[(ldh*5)+1];

        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
        scalarprods[0] += hh[1] * hh[(2+ldh)];
        scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

        // loop counter = 3
        scalarprods[0] += hh[2] * hh[(3+ldh)];
        scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

        scalarprods[1] += hh[1] * hh[3+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

        // loop counter = 4
        scalarprods[0] += hh[3] * hh[(4+ldh)];
        scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

        scalarprods[1] += hh[2] * hh[4+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

        scalarprods[3] += hh[1] * hh[4+(ldh*3)];
        scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

        // loop counter = 5
        scalarprods[0] += hh[4] * hh[(5+ldh)];
        scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

        scalarprods[1] += hh[3] * hh[5+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

        scalarprods[3] += hh[2] * hh[5+(ldh*3)];
        scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

        scalarprods[6] += hh[1] * hh[5+(ldh*4)];
        scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

        #pragma ivdep
        for (i = 6; i < nb; i++)
        {
                scalarprods[0] += hh[i-1] * hh[(i+ldh)];
                scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
                scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
                scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
                scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

                scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
                scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
                scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
                scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

                scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
                scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
                scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

                scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
                scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

                scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
        }


        // Production level kernel calls with padding
278
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
279 280 281 282 283
        for (i = 0; i < nq-24; i+=32)
        {
                hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 32;
        }
284 285
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
286 287 288 289 290
        for (i = 0; i < nq-48; i+=64)
        {
                hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 64;
        }
291
#endif
Andreas Marek's avatar
Andreas Marek committed
292 293 294 295
        if (nq == i)
        {
                return;
        }
296
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
297 298 299 300 301
        if (nq-i == 24)
        {
                hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 24;
        }
Andreas Marek's avatar
Andreas Marek committed
302 303 304
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
305 306 307 308 309
        if (nq-i ==48)
        {
                hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 48;
        }
Andreas Marek's avatar
Andreas Marek committed
310 311 312
#endif

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
313 314 315 316
        if (nq-i == 16)
        {
                hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 16;
Andreas Marek's avatar
Andreas Marek committed
317 318 319 320
        }
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
321 322 323 324 325
        if (nq-i ==32)
        {
                hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 32;
        }
Andreas Marek's avatar
Andreas Marek committed
326 327 328
#endif

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
329 330 331 332
        if (nq-i == 8)
        {
                hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 8;
333
        }
334
#endif
Andreas Marek's avatar
Andreas Marek committed
335

336
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
337 338 339 340 341
        if (nq-i == 16)
        {
                hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
                worked_on += 16;
        }
342
#endif
343 344

#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
345
        if (worked_on != nq)
Andreas Marek's avatar
Andreas Marek committed
346
        {
347
          printf("ERROR in avx512 kernel\n");
Andreas Marek's avatar
Andreas Marek committed
348
          abort();
Andreas Marek's avatar
Andreas Marek committed
349
        }
350
#endif
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif

{
Andreas Marek's avatar
Andreas Marek committed
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [8 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;

        __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
        __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
        __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
        __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
        __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
        __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);

        __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
        __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
        __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
        __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
        __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);

//        register __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
392 393
        __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);

Andreas Marek's avatar
Andreas Marek committed
394 395 396 397
        t1 = _AVX512_FMA(a4_1, h_6_4, t1);
        t1 = _AVX512_FMA(a3_1, h_6_3, t1);
        t1 = _AVX512_FMA(a2_1, h_6_2, t1);
        t1 = _AVX512_FMA(a1_1, h_6_1, t1);
398

Andreas Marek's avatar
Andreas Marek committed
399 400 401 402
        __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
        __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
        __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
        __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
403

Andreas Marek's avatar
Andreas Marek committed
404
//        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
405 406
        __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);

Andreas Marek's avatar
Andreas Marek committed
407 408 409
        v1 = _AVX512_FMA(a3_1, h_5_3, v1);
        v1 = _AVX512_FMA(a2_1, h_5_2, v1);
        v1 = _AVX512_FMA(a1_1, h_5_1, v1);
410

Andreas Marek's avatar
Andreas Marek committed
411 412 413
        __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
        __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
        __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
414

Andreas Marek's avatar
Andreas Marek committed
415
//        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
416 417
        __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);

Andreas Marek's avatar
Andreas Marek committed
418 419
        w1 = _AVX512_FMA(a2_1, h_4_2, w1);
        w1 = _AVX512_FMA(a1_1, h_4_1, w1);
420

Andreas Marek's avatar
Andreas Marek committed
421 422 423
        __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
        __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
424

Andreas Marek's avatar
Andreas Marek committed
425
//        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
426 427
        __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);

Andreas Marek's avatar
Andreas Marek committed
428 429
        z1 = _AVX512_FMA(a1_1, h_3_1, z1);
//        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
430 431
         __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);

Andreas Marek's avatar
Andreas Marek committed
432
//        register __AVX512_DATATYPE x1 = a1_1;
433 434
        __AVX512_DATATYPE x1 = a1_1;

Andreas Marek's avatar
Andreas Marek committed
435
        __AVX512_DATATYPE q1;
436

Andreas Marek's avatar
Andreas Marek committed
437 438 439 440 441 442
        __AVX512_DATATYPE h1;
        __AVX512_DATATYPE h2;
        __AVX512_DATATYPE h3;
        __AVX512_DATATYPE h4;
        __AVX512_DATATYPE h5;
        __AVX512_DATATYPE h6;
443

Andreas Marek's avatar
Andreas Marek committed
444 445 446 447
        for(i = 6; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-5]);
                q1 = _AVX512_LOAD(&q[i*ldq]);
448

Andreas Marek's avatar
Andreas Marek committed
449
                x1 = _AVX512_FMA(q1, h1, x1);
450

Andreas Marek's avatar
Andreas Marek committed
451
                h2 = _AVX512_SET1(hh[ldh+i-4]);
452

Andreas Marek's avatar
Andreas Marek committed
453 454
                y1 = _AVX512_FMA(q1, h2, y1);
                h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
455

Andreas Marek's avatar
Andreas Marek committed
456
                z1 = _AVX512_FMA(q1, h3, z1);
457

Andreas Marek's avatar
Andreas Marek committed
458
                h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
459

Andreas Marek's avatar
Andreas Marek committed
460
                w1 = _AVX512_FMA(q1, h4, w1);
461

Andreas Marek's avatar
Andreas Marek committed
462
                h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
463

Andreas Marek's avatar
Andreas Marek committed
464
                v1 = _AVX512_FMA(q1, h5, v1);
465

Andreas Marek's avatar
Andreas Marek committed
466
                h6 = _AVX512_SET1(hh[(ldh*5)+i]);
467

Andreas Marek's avatar
Andreas Marek committed
468 469
                t1 = _AVX512_FMA(q1, h6, t1);
        }
470

Andreas Marek's avatar
Andreas Marek committed
471 472
        h1 = _AVX512_SET1(hh[nb-5]);
        q1 = _AVX512_LOAD(&q[nb*ldq]);
473

Andreas Marek's avatar
Andreas Marek committed
474
        x1 = _AVX512_FMA(q1, h1, x1);
475

Andreas Marek's avatar
Andreas Marek committed
476
        h2 = _AVX512_SET1(hh[ldh+nb-4]);
477

Andreas Marek's avatar
Andreas Marek committed
478
        y1 = _AVX512_FMA(q1, h2, y1);
479

Andreas Marek's avatar
Andreas Marek committed
480
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
481

Andreas Marek's avatar
Andreas Marek committed
482
        z1 = _AVX512_FMA(q1, h3, z1);
483

Andreas Marek's avatar
Andreas Marek committed
484
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
485

Andreas Marek's avatar
Andreas Marek committed
486
        w1 = _AVX512_FMA(q1, h4, w1);
487

Andreas Marek's avatar
Andreas Marek committed
488
        h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
489

Andreas Marek's avatar
Andreas Marek committed
490
        v1 = _AVX512_FMA(q1, h5, v1);
491

Andreas Marek's avatar
Andreas Marek committed
492
        h1 = _AVX512_SET1(hh[nb-4]);
493

Andreas Marek's avatar
Andreas Marek committed
494
        q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
495

Andreas Marek's avatar
Andreas Marek committed
496
        x1 = _AVX512_FMA(q1, h1, x1);
497

Andreas Marek's avatar
Andreas Marek committed
498
        h2 = _AVX512_SET1(hh[ldh+nb-3]);
499

Andreas Marek's avatar
Andreas Marek committed
500
        y1 = _AVX512_FMA(q1, h2, y1);
501

Andreas Marek's avatar
Andreas Marek committed
502
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
503

Andreas Marek's avatar
Andreas Marek committed
504
        z1 = _AVX512_FMA(q1, h3, z1);
505

Andreas Marek's avatar
Andreas Marek committed
506
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
507

Andreas Marek's avatar
Andreas Marek committed
508
        w1 = _AVX512_FMA(q1, h4, w1);
509

Andreas Marek's avatar
Andreas Marek committed
510 511
        h1 = _AVX512_SET1(hh[nb-3]);
        q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
512

Andreas Marek's avatar
Andreas Marek committed
513
        x1 = _AVX512_FMA(q1, h1, x1);
514

Andreas Marek's avatar
Andreas Marek committed
515
        h2 = _AVX512_SET1(hh[ldh+nb-2]);
516

Andreas Marek's avatar
Andreas Marek committed
517
        y1 = _AVX512_FMA(q1, h2, y1);
518

Andreas Marek's avatar
Andreas Marek committed
519
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
520

Andreas Marek's avatar
Andreas Marek committed
521
        z1 = _AVX512_FMA(q1, h3, z1);
522

Andreas Marek's avatar
Andreas Marek committed
523 524
        h1 = _AVX512_SET1(hh[nb-2]);
        q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
525

Andreas Marek's avatar
Andreas Marek committed
526
        x1 = _AVX512_FMA(q1, h1, x1);
527

Andreas Marek's avatar
Andreas Marek committed
528
        h2 = _AVX512_SET1(hh[ldh+nb-1]);
529

Andreas Marek's avatar
Andreas Marek committed
530
        y1 = _AVX512_FMA(q1, h2, y1);
531

Andreas Marek's avatar
Andreas Marek committed
532 533
        h1 = _AVX512_SET1(hh[nb-1]);
        q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
534

Andreas Marek's avatar
Andreas Marek committed
535
        x1 = _AVX512_FMA(q1, h1, x1);
536

Andreas Marek's avatar
Andreas Marek committed
537 538 539
        /////////////////////////////////////////////////////
        // Apply tau, correct wrong calculation using pre-calculated scalar products
        /////////////////////////////////////////////////////
540

Andreas Marek's avatar
Andreas Marek committed
541 542
        __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
        x1 = _AVX512_MUL(x1, tau1);
543

Andreas Marek's avatar
Andreas Marek committed
544 545 546
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
        h2 = _AVX512_MUL(tau2, vs_1_2);
547

Andreas Marek's avatar
Andreas Marek committed
548
        y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
549

Andreas Marek's avatar
Andreas Marek committed
550 551 552
        __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
        __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
        __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
553

Andreas Marek's avatar
Andreas Marek committed
554 555
        h2 = _AVX512_MUL(tau3, vs_1_3);
        h3 = _AVX512_MUL(tau3, vs_2_3);
556

Andreas Marek's avatar
Andreas Marek committed
557
        z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
558

Andreas Marek's avatar
Andreas Marek committed
559 560 561
        __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
        __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
        __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
562

Andreas Marek's avatar
Andreas Marek committed
563 564
        h2 = _AVX512_MUL(tau4, vs_1_4);
        h3 = _AVX512_MUL(tau4, vs_2_4);
565

Andreas Marek's avatar
Andreas Marek committed
566 567
        __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
        h4 = _AVX512_MUL(tau4, vs_3_4);
568

Andreas Marek's avatar
Andreas Marek committed
569
        w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
570

Andreas Marek's avatar
Andreas Marek committed
571 572 573
        __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
        __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
        __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
574

Andreas Marek's avatar
Andreas Marek committed
575 576
        h2 = _AVX512_MUL(tau5, vs_1_5);
        h3 = _AVX512_MUL(tau5, vs_2_5);
577

Andreas Marek's avatar
Andreas Marek committed
578 579
        __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
        __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
580

Andreas Marek's avatar
Andreas Marek committed
581 582
        h4 = _AVX512_MUL(tau5, vs_3_5);
        h5 = _AVX512_MUL(tau5, vs_4_5);
583

Andreas Marek's avatar
Andreas Marek committed
584
        v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
585

Andreas Marek's avatar
Andreas Marek committed
586 587 588 589 590
        __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
        __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
        __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
        h2 = _AVX512_MUL(tau6, vs_1_6);
        h3 = _AVX512_MUL(tau6, vs_2_6);
591

Andreas Marek's avatar
Andreas Marek committed
592 593 594
        __AVX512_DATATYPE vs_3_6 = _AVX512_SET1(scalarprods[12]);
        __AVX512_DATATYPE vs_4_6 = _AVX512_SET1(scalarprods[13]);
        __AVX512_DATATYPE vs_5_6 = _AVX512_SET1(scalarprods[14]);
595

Andreas Marek's avatar
Andreas Marek committed
596 597 598
        h4 = _AVX512_MUL(tau6, vs_3_6);
        h5 = _AVX512_MUL(tau6, vs_4_6);
        h6 = _AVX512_MUL(tau6, vs_5_6);
599

Andreas Marek's avatar
Andreas Marek committed
600
        t1 = _AVX512_FMSUB(t1, tau6, _AVX512_FMA(v1, h6, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)))));
601

Andreas Marek's avatar
Andreas Marek committed
602 603 604
        /////////////////////////////////////////////////////
        // Rank-1 update of Q [8 x nb+3]
        /////////////////////////////////////////////////////
605

Andreas Marek's avatar
Andreas Marek committed
606 607 608
        q1 = _AVX512_LOAD(&q[0]);
        q1 = _AVX512_SUB(q1, t1);
        _AVX512_STORE(&q[0],q1);
609

Andreas Marek's avatar
Andreas Marek committed
610 611 612
        h6 = _AVX512_SET1(hh[(ldh*5)+1]);
        q1 = _AVX512_LOAD(&q[ldq]);
        q1 = _AVX512_SUB(q1, v1);
613

Andreas Marek's avatar
Andreas Marek committed
614
        q1 = _AVX512_NFMA(t1, h6, q1);
615

Andreas Marek's avatar
Andreas Marek committed
616
        _AVX512_STORE(&q[ldq],q1);
617

Andreas Marek's avatar
Andreas Marek committed
618 619 620
        h5 = _AVX512_SET1(hh[(ldh*4)+1]);
        q1 = _AVX512_LOAD(&q[ldq*2]);
        q1 = _AVX512_SUB(q1, w1);
621

Andreas Marek's avatar
Andreas Marek committed
622
        q1 = _AVX512_NFMA(v1, h5, q1);
623

Andreas Marek's avatar
Andreas Marek committed
624
        h6 = _AVX512_SET1(hh[(ldh*5)+2]);
625

Andreas Marek's avatar
Andreas Marek committed
626
        q1 = _AVX512_NFMA(t1, h6, q1);
627

Andreas Marek's avatar
Andreas Marek committed
628
        _AVX512_STORE(&q[ldq*2],q1);
629

Andreas Marek's avatar
Andreas Marek committed
630 631 632
        h4 = _AVX512_SET1(hh[(ldh*3)+1]);
        q1 = _AVX512_LOAD(&q[ldq*3]);
        q1 = _AVX512_SUB(q1, z1);
633

Andreas Marek's avatar
Andreas Marek committed
634
        q1 = _AVX512_NFMA(w1, h4, q1);
635

Andreas Marek's avatar
Andreas Marek committed
636
        h5 = _AVX512_SET1(hh[(ldh*4)+2]);
637

Andreas Marek's avatar
Andreas Marek committed
638
        q1 = _AVX512_NFMA(v1, h5, q1);
639

Andreas Marek's avatar
Andreas Marek committed
640
        h6 = _AVX512_SET1(hh[(ldh*5)+3]);
641

Andreas Marek's avatar
Andreas Marek committed
642
        q1 = _AVX512_NFMA(t1, h6, q1);
643

Andreas Marek's avatar
Andreas Marek committed
644
        _AVX512_STORE(&q[ldq*3],q1);
645

Andreas Marek's avatar
Andreas Marek committed
646 647 648
        h3 = _AVX512_SET1(hh[(ldh*2)+1]);
        q1 = _AVX512_LOAD(&q[ldq*4]);
        q1 = _AVX512_SUB(q1, y1);
649

Andreas Marek's avatar
Andreas Marek committed
650
        q1 = _AVX512_NFMA(z1, h3, q1);
651

Andreas Marek's avatar
Andreas Marek committed
652
        h4 = _AVX512_SET1(hh[(ldh*3)+2]);
653

Andreas Marek's avatar
Andreas Marek committed
654
        q1 = _AVX512_NFMA(w1, h4, q1);
655

Andreas Marek's avatar
Andreas Marek committed
656
        h5 = _AVX512_SET1(hh[(ldh*4)+3]);
657

Andreas Marek's avatar
Andreas Marek committed
658
        q1 = _AVX512_NFMA(v1, h5, q1);
659

Andreas Marek's avatar
Andreas Marek committed
660
        h6 = _AVX512_SET1(hh[(ldh*5)+4]);
661

Andreas Marek's avatar
Andreas Marek committed
662
        q1 = _AVX512_NFMA(t1, h6, q1);
663

Andreas Marek's avatar
Andreas Marek committed
664
        _AVX512_STORE(&q[ldq*4],q1);
665

Andreas Marek's avatar
Andreas Marek committed
666 667 668
        h2 = _AVX512_SET1(hh[(ldh)+1]);
        q1 = _AVX512_LOAD(&q[ldq*5]);
        q1 = _AVX512_SUB(q1, x1);
669

Andreas Marek's avatar
Andreas Marek committed
670
        q1 = _AVX512_NFMA(y1, h2, q1);
671

Andreas Marek's avatar
Andreas Marek committed
672
        h3 = _AVX512_SET1(hh[(ldh*2)+2]);
673

Andreas Marek's avatar
Andreas Marek committed
674
        q1 = _AVX512_NFMA(z1, h3, q1);
675

Andreas Marek's avatar
Andreas Marek committed
676
        h4 = _AVX512_SET1(hh[(ldh*3)+3]);
677

Andreas Marek's avatar
Andreas Marek committed
678
        q1 = _AVX512_NFMA(w1, h4, q1);
679

Andreas Marek's avatar
Andreas Marek committed
680
        h5 = _AVX512_SET1(hh[(ldh*4)+4]);
681

Andreas Marek's avatar
Andreas Marek committed
682
        q1 = _AVX512_NFMA(v1, h5, q1);
683

Andreas Marek's avatar
Andreas Marek committed
684
        h6 = _AVX512_SET1(hh[(ldh*5)+5]);
685

Andreas Marek's avatar
Andreas Marek committed
686
        q1 = _AVX512_NFMA(t1, h6, q1);
687

Andreas Marek's avatar
Andreas Marek committed
688
        _AVX512_STORE(&q[ldq*5],q1);
689

Andreas Marek's avatar
Andreas Marek committed
690 691 692 693
        for (i = 6; i < nb; i++)
        {
                q1 = _AVX512_LOAD(&q[i*ldq]);
                h1 = _AVX512_SET1(hh[i-5]);
694

Andreas Marek's avatar
Andreas Marek committed
695
                q1 = _AVX512_NFMA(x1, h1, q1);
696

Andreas Marek's avatar
Andreas Marek committed
697
                h2 = _AVX512_SET1(hh[ldh+i-4]);
698

Andreas Marek's avatar
Andreas Marek committed
699
                q1 = _AVX512_NFMA(y1, h2, q1);
700

Andreas Marek's avatar
Andreas Marek committed
701
                h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
702

Andreas Marek's avatar
Andreas Marek committed
703
                q1 = _AVX512_NFMA(z1, h3, q1);
704

Andreas Marek's avatar
Andreas Marek committed
705
                h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
706

Andreas Marek's avatar
Andreas Marek committed
707
                q1 = _AVX512_NFMA(w1, h4, q1);
708

Andreas Marek's avatar
Andreas Marek committed
709
                h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
710

Andreas Marek's avatar
Andreas Marek committed
711
                q1 = _AVX512_NFMA(v1, h5, q1);
712

Andreas Marek's avatar
Andreas Marek committed
713
                h6 = _AVX512_SET1(hh[(ldh*5)+i]);
714

Andreas Marek's avatar
Andreas Marek committed
715
                q1 = _AVX512_NFMA(t1, h6, q1);
716

Andreas Marek's avatar
Andreas Marek committed
717 718
                _AVX512_STORE(&q[i*ldq],q1);
        }
719

Andreas Marek's avatar
Andreas Marek committed
720 721
        h1 = _AVX512_SET1(hh[nb-5]);
        q1 = _AVX512_LOAD(&q[nb*ldq]);
722

Andreas Marek's avatar
Andreas Marek committed
723
        q1 = _AVX512_NFMA(x1, h1, q1);
724

Andreas Marek's avatar
Andreas Marek committed
725
        h2 = _AVX512_SET1(hh[ldh+nb-4]);
726

Andreas Marek's avatar
Andreas Marek committed
727
        q1 = _AVX512_NFMA(y1, h2, q1);
728

Andreas Marek's avatar
Andreas Marek committed
729
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
730

Andreas Marek's avatar
Andreas Marek committed
731
        q1 = _AVX512_NFMA(z1, h3, q1);
732

Andreas Marek's avatar
Andreas Marek committed
733
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
734

Andreas Marek's avatar
Andreas Marek committed
735
        q1 = _AVX512_NFMA(w1, h4, q1);
736

Andreas Marek's avatar
Andreas Marek committed
737
        h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
738

Andreas Marek's avatar
Andreas Marek committed
739
        q1 = _AVX512_NFMA(v1, h5, q1);
740

Andreas Marek's avatar
Andreas Marek committed
741
        _AVX512_STORE(&q[nb*ldq],q1);
742

Andreas Marek's avatar
Andreas Marek committed
743 744
        h1 = _AVX512_SET1(hh[nb-4]);
        q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
745

Andreas Marek's avatar
Andreas Marek committed
746
        q1 = _AVX512_NFMA(x1, h1, q1);
747

Andreas Marek's avatar
Andreas Marek committed
748
        h2 = _AVX512_SET1(hh[ldh+nb-3]);
749

Andreas Marek's avatar
Andreas Marek committed
750
        q1 = _AVX512_NFMA(y1, h2, q1);
751

Andreas Marek's avatar
Andreas Marek committed
752
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
753

Andreas Marek's avatar
Andreas Marek committed
754
        q1 = _AVX512_NFMA(z1, h3, q1);
755

Andreas Marek's avatar
Andreas Marek committed
756
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
757

Andreas Marek's avatar
Andreas Marek committed
758
        q1 = _AVX512_NFMA(w1, h4, q1);
759

Andreas Marek's avatar
Andreas Marek committed
760
        _AVX512_STORE(&q[(nb+1)*ldq],q1);
761

Andreas Marek's avatar
Andreas Marek committed
762 763
        h1 = _AVX512_SET1(hh[nb-3]);
        q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
764

Andreas Marek's avatar
Andreas Marek committed
765
        q1 = _AVX512_NFMA(x1, h1, q1);
766

Andreas Marek's avatar
Andreas Marek committed
767
        h2 = _AVX512_SET1(hh[ldh+nb-2]);
768

Andreas Marek's avatar
Andreas Marek committed
769
        q1 = _AVX512_NFMA(y1, h2, q1);
770

Andreas Marek's avatar
Andreas Marek committed
771
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
772

Andreas Marek's avatar
Andreas Marek committed
773
        q1 = _AVX512_NFMA(z1, h3, q1);
774

Andreas Marek's avatar
Andreas Marek committed
775
        _AVX512_STORE(&q[(nb+2)*ldq],q1);
776

Andreas Marek's avatar
Andreas Marek committed
777 778
        h1 = _AVX512_SET1(hh[nb-2]);
        q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
779

Andreas Marek's avatar
Andreas Marek committed
780
        q1 = _AVX512_NFMA(x1, h1, q1);
781

Andreas Marek's avatar
Andreas Marek committed
782
        h2 = _AVX512_SET1(hh[ldh+nb-1]);
783

Andreas Marek's avatar
Andreas Marek committed
784
        q1 = _AVX512_NFMA(y1, h2, q1);
785

Andreas Marek's avatar
Andreas Marek committed
786
        _AVX512_STORE(&q[(nb+3)*ldq],q1);
787

Andreas Marek's avatar
Andreas Marek committed
788 789
        h1 = _AVX512_SET1(hh[nb-1]);
        q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
790

Andreas Marek's avatar
Andreas Marek committed
791
        q1 = _AVX512_NFMA(x1, h1, q1);
792

Andreas Marek's avatar
Andreas Marek committed
793
        _AVX512_STORE(&q[(nb+4)*ldq],q1);
794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 32 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [8 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;

        __AVX512_DATATYPE a1_1 = _AVX512_LOAD(&q[ldq*5]);
        __AVX512_DATATYPE a2_1 = _AVX512_LOAD(&q[ldq*4]);
        __AVX512_DATATYPE a3_1 = _AVX512_LOAD(&q[ldq*3]);
        __AVX512_DATATYPE a4_1 = _AVX512_LOAD(&q[ldq*2]);
        __AVX512_DATATYPE a5_1 = _AVX512_LOAD(&q[ldq]);
        __AVX512_DATATYPE a6_1 = _AVX512_LOAD(&q[0]);

        __AVX512_DATATYPE h_6_5 = _AVX512_SET1(hh[(ldh*5)+1]);
        __AVX512_DATATYPE h_6_4 = _AVX512_SET1(hh[(ldh*5)+2]);
        __AVX512_DATATYPE h_6_3 = _AVX512_SET1(hh[(ldh*5)+3]);
        __AVX512_DATATYPE h_6_2 = _AVX512_SET1(hh[(ldh*5)+4]);
        __AVX512_DATATYPE h_6_1 = _AVX512_SET1(hh[(ldh*5)+5]);

//        register__AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);
834 835
        __AVX512_DATATYPE t1 = _AVX512_FMA(a5_1, h_6_5, a6_1);

Andreas Marek's avatar
Andreas Marek committed
836 837 838 839
        t1 = _AVX512_FMA(a4_1, h_6_4, t1);
        t1 = _AVX512_FMA(a3_1, h_6_3, t1);
        t1 = _AVX512_FMA(a2_1, h_6_2, t1);
        t1 = _AVX512_FMA(a1_1, h_6_1, t1);
840

Andreas Marek's avatar
Andreas Marek committed
841 842 843 844
        __AVX512_DATATYPE h_5_4 = _AVX512_SET1(hh[(ldh*4)+1]);
        __AVX512_DATATYPE h_5_3 = _AVX512_SET1(hh[(ldh*4)+2]);
        __AVX512_DATATYPE h_5_2 = _AVX512_SET1(hh[(ldh*4)+3]);
        __AVX512_DATATYPE h_5_1 = _AVX512_SET1(hh[(ldh*4)+4]);
845

Andreas Marek's avatar
Andreas Marek committed
846
//        register __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);
847 848
        __AVX512_DATATYPE v1 = _AVX512_FMA(a4_1, h_5_4, a5_1);

Andreas Marek's avatar
Andreas Marek committed
849 850 851
        v1 = _AVX512_FMA(a3_1, h_5_3, v1);
        v1 = _AVX512_FMA(a2_1, h_5_2, v1);
        v1 = _AVX512_FMA(a1_1, h_5_1, v1);
852

Andreas Marek's avatar
Andreas Marek committed
853 854 855
        __AVX512_DATATYPE h_4_3 = _AVX512_SET1(hh[(ldh*3)+1]);
        __AVX512_DATATYPE h_4_2 = _AVX512_SET1(hh[(ldh*3)+2]);
        __AVX512_DATATYPE h_4_1 = _AVX512_SET1(hh[(ldh*3)+3]);
856

Andreas Marek's avatar
Andreas Marek committed
857
//        register __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);
858 859
        __AVX512_DATATYPE w1 = _AVX512_FMA(a3_1, h_4_3, a4_1);

Andreas Marek's avatar
Andreas Marek committed
860 861
        w1 = _AVX512_FMA(a2_1, h_4_2, w1);
        w1 = _AVX512_FMA(a1_1, h_4_1, w1);
862

Andreas Marek's avatar
Andreas Marek committed
863 864 865
        __AVX512_DATATYPE h_2_1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h_3_2 = _AVX512_SET1(hh[(ldh*2)+1]);
        __AVX512_DATATYPE h_3_1 = _AVX512_SET1(hh[(ldh*2)+2]);
866

Andreas Marek's avatar
Andreas Marek committed
867
//        register __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);
868 869
        __AVX512_DATATYPE z1 = _AVX512_FMA(a2_1, h_3_2, a3_1);

Andreas Marek's avatar
Andreas Marek committed
870 871
        z1 = _AVX512_FMA(a1_1, h_3_1, z1);
//        register __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);
872 873 874
        __AVX512_DATATYPE y1 = _AVX512_FMA(a1_1, h_2_1, a2_1);


Andreas Marek's avatar
Andreas Marek committed
875
//        register __AVX512_DATATYPE x1 = a1_1;
876 877 878
        __AVX512_DATATYPE x1 = a1_1;


Andreas Marek's avatar
Andreas Marek committed
879 880 881 882 883 884
        __AVX512_DATATYPE a1_2 = _AVX512_LOAD(&q[(ldq*5)+offset]);
        __AVX512_DATATYPE a2_2 = _AVX512_LOAD(&q[(ldq*4)+offset]);
        __AVX512_DATATYPE a3_2 = _AVX512_LOAD(&q[(ldq*3)+offset]);
        __AVX512_DATATYPE a4_2 = _AVX512_LOAD(&q[(ldq*2)+offset]);
        __AVX512_DATATYPE a5_2 = _AVX512_LOAD(&q[(ldq)+offset]);
        __AVX512_DATATYPE a6_2 = _AVX512_LOAD(&q[0+offset]);
885

Andreas Marek's avatar
Andreas Marek committed
886
//        register __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);
887 888
        __AVX512_DATATYPE t2 = _AVX512_FMA(a5_2, h_6_5, a6_2);

Andreas Marek's avatar
Andreas Marek committed
889 890 891 892
        t2 = _AVX512_FMA(a4_2, h_6_4, t2);
        t2 = _AVX512_FMA(a3_2, h_6_3, t2);
        t2 = _AVX512_FMA(a2_2, h_6_2, t2);
        t2 = _AVX512_FMA(a1_2, h_6_1, t2);
893

Andreas Marek's avatar
Andreas Marek committed
894
//        register __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);
895 896
        __AVX512_DATATYPE v2 = _AVX512_FMA(a4_2, h_5_4, a5_2);

Andreas Marek's avatar
Andreas Marek committed
897 898 899
        v2 = _AVX512_FMA(a3_2, h_5_3, v2);
        v2 = _AVX512_FMA(a2_2, h_5_2, v2);
        v2 = _AVX512_FMA(a1_2, h_5_1, v2);
900

Andreas Marek's avatar
Andreas Marek committed
901
//        register __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);
902 903
        __AVX512_DATATYPE w2 = _AVX512_FMA(a3_2, h_4_3, a4_2);

Andreas Marek's avatar
Andreas Marek committed
904 905
        w2 = _AVX512_FMA(a2_2, h_4_2, w2);
        w2 = _AVX512_FMA(a1_2, h_4_1, w2);
906

Andreas Marek's avatar
Andreas Marek committed
907
//        register __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);
908 909
        __AVX512_DATATYPE z2 = _AVX512_FMA(a2_2, h_3_2, a3_2);

Andreas Marek's avatar
Andreas Marek committed
910 911
        z2 = _AVX512_FMA(a1_2, h_3_1, z2);
//        register __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);
912 913 914
        __AVX512_DATATYPE y2 = _AVX512_FMA(a1_2, h_2_1, a2_2);


Andreas Marek's avatar
Andreas Marek committed
915
//        register __AVX512_DATATYPE x2 = a1_2;
916 917
        __AVX512_DATATYPE x2 = a1_2;

Andreas Marek's avatar
Andreas Marek committed
918 919
        __AVX512_DATATYPE q1;
        __AVX512_DATATYPE q2;
920

Andreas Marek's avatar
Andreas Marek committed
921 922 923 924 925 926
        __AVX512_DATATYPE h1;
        __AVX512_DATATYPE h2;
        __AVX512_DATATYPE h3;
        __AVX512_DATATYPE h4;
        __AVX512_DATATYPE h5;
        __AVX512_DATATYPE h6;
927

Andreas Marek's avatar
Andreas Marek committed
928 929 930 931 932
        for(i = 6; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-5]);
                q1 = _AVX512_LOAD(&q[i*ldq]);
                q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
933

Andreas Marek's avatar
Andreas Marek committed
934 935
                x1 = _AVX512_FMA(q1, h1, x1);
                x2 = _AVX512_FMA(q2, h1, x2);
936

Andreas Marek's avatar
Andreas Marek committed
937
                h2 = _AVX512_SET1(hh[ldh+i-4]);
938

Andreas Marek's avatar
Andreas Marek committed
939 940
                y1 = _AVX512_FMA(q1, h2, y1);
                y2 = _AVX512_FMA(q2, h2, y2);
941

Andreas Marek's avatar
Andreas Marek committed
942
                h3 = _AVX512_SET1(hh[(ldh*2)+i-3]);
943

Andreas Marek's avatar
Andreas Marek committed
944 945
                z1 = _AVX512_FMA(q1, h3, z1);
                z2 = _AVX512_FMA(q2, h3, z2);
946

Andreas Marek's avatar
Andreas Marek committed
947
                h4 = _AVX512_SET1(hh[(ldh*3)+i-2]);
948

Andreas Marek's avatar
Andreas Marek committed
949 950
                w1 = _AVX512_FMA(q1, h4, w1);
                w2 = _AVX512_FMA(q2, h4, w2);
951

Andreas Marek's avatar
Andreas Marek committed
952
                h5 = _AVX512_SET1(hh[(ldh*4)+i-1]);
953

Andreas Marek's avatar
Andreas Marek committed
954 955
                v1 = _AVX512_FMA(q1, h5, v1);
                v2 = _AVX512_FMA(q2, h5, v2);
956

Andreas Marek's avatar
Andreas Marek committed
957
                h6 = _AVX512_SET1(hh[(ldh*5)+i]);
958

Andreas Marek's avatar
Andreas Marek committed
959 960 961
                t1 = _AVX512_FMA(q1, h6, t1);
                t2 = _AVX512_FMA(q2, h6, t2);
        }
962

Andreas Marek's avatar
Andreas Marek committed
963 964 965
        h1 = _AVX512_SET1(hh[nb-5]);
        q1 = _AVX512_LOAD(&q[nb*ldq]);
        q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
966

Andreas Marek's avatar
Andreas Marek committed
967 968
        x1 = _AVX512_FMA(q1, h1, x1);
        x2 = _AVX512_FMA(q2, h1, x2);
969

Andreas Marek's avatar
Andreas Marek committed
970
        h2 = _AVX512_SET1(hh[ldh+nb-4]);
971

Andreas Marek's avatar
Andreas Marek committed
972 973
        y1 = _AVX512_FMA(q1, h2, y1);
        y2 = _AVX512_FMA(q2, h2, y2);
974

Andreas Marek's avatar
Andreas Marek committed
975
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-3]);
976

Andreas Marek's avatar
Andreas Marek committed
977 978
        z1 = _AVX512_FMA(q1, h3, z1);
        z2 = _AVX512_FMA(q2, h3, z2);
979

Andreas Marek's avatar
Andreas Marek committed
980
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-2]);
981

Andreas Marek's avatar
Andreas Marek committed
982 983
        w1 = _AVX512_FMA(q1, h4, w1);
        w2 = _AVX512_FMA(q2, h4, w2);
984

Andreas Marek's avatar
Andreas Marek committed
985
        h5 = _AVX512_SET1(hh[(ldh*4)+nb-1]);
986

Andreas Marek's avatar
Andreas Marek committed
987 988
        v1 = _AVX512_FMA(q1, h5, v1);
        v2 = _AVX512_FMA(q2, h5, v2);
989

Andreas Marek's avatar
Andreas Marek committed
990
        h1 = _AVX512_SET1(hh[nb-4]);
991

Andreas Marek's avatar
Andreas Marek committed
992 993
        q1 = _AVX512_LOAD(&q[(nb+1)*ldq]);
        q2 = _AVX512_LOAD(&q[((nb+1)*ldq)+offset]);
994

Andreas Marek's avatar
Andreas Marek committed
995 996
        x1 = _AVX512_FMA(q1, h1, x1);
        x2 = _AVX512_FMA(q2, h1, x2);
997

Andreas Marek's avatar
Andreas Marek committed
998
        h2 = _AVX512_SET1(hh[ldh+nb-3]);
999

Andreas Marek's avatar
Andreas Marek committed
1000 1001
        y1 = _AVX512_FMA(q1, h2, y1);
        y2 = _AVX512_FMA(q2, h2, y2);
1002

Andreas Marek's avatar
Andreas Marek committed
1003
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-2]);
1004

Andreas Marek's avatar
Andreas Marek committed
1005 1006
        z1 = _AVX512_FMA(q1, h3, z1);
        z2 = _AVX512_FMA(q2, h3, z2);
1007

Andreas Marek's avatar
Andreas Marek committed
1008
        h4 = _AVX512_SET1(hh[(ldh*3)+nb-1]);
1009

Andreas Marek's avatar
Andreas Marek committed
1010 1011
        w1 = _AVX512_FMA(q1, h4, w1);
        w2 = _AVX512_FMA(q2, h4, w2);
1012

Andreas Marek's avatar
Andreas Marek committed
1013 1014 1015
        h1 = _AVX512_SET1(hh[nb-3]);
        q1 = _AVX512_LOAD(&q[(nb+2)*ldq]);
        q2 = _AVX512_LOAD(&q[((nb+2)*ldq)+offset]);
1016

Andreas Marek's avatar
Andreas Marek committed
1017 1018
        x1 = _AVX512_FMA(q1, h1, x1);
        x2 = _AVX512_FMA(q2, h1, x2);
1019

Andreas Marek's avatar
Andreas Marek committed
1020
        h2 = _AVX512_SET1(hh[ldh+nb-2]);
1021

Andreas Marek's avatar
Andreas Marek committed
1022 1023
        y1 = _AVX512_FMA(q1, h2, y1);
        y2 = _AVX512_FMA(q2, h2, y2);
1024

Andreas Marek's avatar
Andreas Marek committed
1025
        h3 = _AVX512_SET1(hh[(ldh*2)+nb-1]);
1026

Andreas Marek's avatar
Andreas Marek committed
1027 1028
        z1 = _AVX512_FMA(q1, h3, z1);
        z2 = _AVX512_FMA(q2, h3, z2);
1029

Andreas Marek's avatar
Andreas Marek committed
1030 1031 1032
        h1 = _AVX512_SET1(hh[nb-2]);
        q1 = _AVX512_LOAD(&q[(nb+3)*ldq]);
        q2 = _AVX512_LOAD(&q[((nb+3)*ldq)+offset]);
1033

Andreas Marek's avatar
Andreas Marek committed
1034 1035
        x1 = _AVX512_FMA(q1, h1, x1);
        x2 = _AVX512_FMA(q2, h1, x2);
1036

Andreas Marek's avatar
Andreas Marek committed
1037
        h2 = _AVX512_SET1(hh[ldh+nb-1]);
1038

Andreas Marek's avatar
Andreas Marek committed
1039 1040
        y1 = _AVX512_FMA(q1, h2, y1);
        y2 = _AVX512_FMA(q2, h2, y2);
1041

Andreas Marek's avatar
Andreas Marek committed
1042 1043 1044
        h1 = _AVX512_SET1(hh[nb-1]);
        q1 = _AVX512_LOAD(&q[(nb+4)*ldq]);
        q2 = _AVX512_LOAD(&q[((nb+4)*ldq)+offset]);
1045

Andreas Marek's avatar
Andreas Marek committed
1046 1047
        x1 = _AVX512_FMA(q1, h1, x1);
        x2 = _AVX512_FMA(q2, h1, x2);
1048

Andreas Marek's avatar
Andreas Marek committed
1049 1050 1051
        /////////////////////////////////////////////////////
        // Apply tau, correct wrong calculation using pre-calculated scalar products
        /////////////////////////////////////////////////////
1052

Andreas Marek's avatar
Andreas Marek committed
1053 1054 1055
        __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
        x1 = _AVX512_MUL(x1, tau1);
        x2 = _AVX512_MUL(x2, tau1);
1056

Andreas Marek's avatar
Andreas Marek committed
1057 1058 1059
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs_1_2 = _AVX512_SET1(scalarprods[0]);
        h2 = _AVX512_MUL(tau2, vs_1_2);
1060

Andreas Marek's avatar
Andreas Marek committed
1061 1062
        y1 = _AVX512_FMSUB(y1, tau2, _AVX512_MUL(x1,h2));
        y2 = _AVX512_FMSUB(y2, tau2, _AVX512_MUL(x2,h2));
1063

Andreas Marek's avatar
Andreas Marek committed
1064 1065 1066
        __AVX512_DATATYPE tau3 = _AVX512_SET1(hh[ldh*2]);
        __AVX512_DATATYPE vs_1_3 = _AVX512_SET1(scalarprods[1]);
        __AVX512_DATATYPE vs_2_3 = _AVX512_SET1(scalarprods[2]);
1067

Andreas Marek's avatar
Andreas Marek committed
1068 1069
        h2 = _AVX512_MUL(tau3, vs_1_3);
        h3 = _AVX512_MUL(tau3, vs_2_3);
1070

Andreas Marek's avatar
Andreas Marek committed
1071 1072
        z1 = _AVX512_FMSUB(z1, tau3, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2)));
        z2 = _AVX512_FMSUB(z2, tau3, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2)));
1073

Andreas Marek's avatar
Andreas Marek committed
1074 1075 1076
        __AVX512_DATATYPE tau4 = _AVX512_SET1(hh[ldh*3]);
        __AVX512_DATATYPE vs_1_4 = _AVX512_SET1(scalarprods[3]);
        __AVX512_DATATYPE vs_2_4 = _AVX512_SET1(scalarprods[4]);
1077

Andreas Marek's avatar
Andreas Marek committed
1078 1079
        h2 = _AVX512_MUL(tau4, vs_1_4);
        h3 = _AVX512_MUL(tau4, vs_2_4);
1080

Andreas Marek's avatar
Andreas Marek committed
1081 1082
        __AVX512_DATATYPE vs_3_4 = _AVX512_SET1(scalarprods[5]);
        h4 = _AVX512_MUL(tau4, vs_3_4);
1083

Andreas Marek's avatar
Andreas Marek committed
1084 1085
        w1 = _AVX512_FMSUB(w1, tau4, _AVX512_FMA(z1, h4, _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
        w2 = _AVX512_FMSUB(w2, tau4, _AVX512_FMA(z2, h4, _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1086

Andreas Marek's avatar
Andreas Marek committed
1087 1088 1089
        __AVX512_DATATYPE tau5 = _AVX512_SET1(hh[ldh*4]);
        __AVX512_DATATYPE vs_1_5 = _AVX512_SET1(scalarprods[6]);
        __AVX512_DATATYPE vs_2_5 = _AVX512_SET1(scalarprods[7]);
1090

Andreas Marek's avatar
Andreas Marek committed
1091 1092
        h2 = _AVX512_MUL(tau5, vs_1_5);
        h3 = _AVX512_MUL(tau5, vs_2_5);
1093

Andreas Marek's avatar
Andreas Marek committed
1094 1095
        __AVX512_DATATYPE vs_3_5 = _AVX512_SET1(scalarprods[8]);
        __AVX512_DATATYPE vs_4_5 = _AVX512_SET1(scalarprods[9]);
1096

Andreas Marek's avatar
Andreas Marek committed
1097 1098
        h4 = _AVX512_MUL(tau5, vs_3_5);
        h5 = _AVX512_MUL(tau5, vs_4_5);
1099

Andreas Marek's avatar
Andreas Marek committed
1100 1101
        v1 = _AVX512_FMSUB(v1, tau5, _AVX512_ADD(_AVX512_FMA(w1, h5, _AVX512_MUL(z1,h4)), _AVX512_FMA(y1, h3, _AVX512_MUL(x1,h2))));
        v2 = _AVX512_FMSUB(v2, tau5, _AVX512_ADD(_AVX512_FMA(w2, h5, _AVX512_MUL(z2,h4)), _AVX512_FMA(y2, h3, _AVX512_MUL(x2,h2))));
1102

Andreas Marek's avatar
Andreas Marek committed
1103 1104 1105 1106 1107
        __AVX512_DATATYPE tau6 = _AVX512_SET1(hh[ldh*5]);
        __AVX512_DATATYPE vs_1_6 = _AVX512_SET1(scalarprods[10]);
        __AVX512_DATATYPE vs_2_6 = _AVX512_SET1(scalarprods[11]);
        h2 = _AVX512_MUL(tau6, vs_1_6);
        h3 = _AVX512_MUL(tau6, vs_2_6);
1108