real_avx512_2hv_template.c 28 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>
Andreas Marek's avatar
Andreas Marek committed
51 52
#include <stdio.h>
#include <stdlib.h>
53 54 55 56 57 58 59 60 61 62 63 64 65

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 8

#define __AVX512_DATATYPE __m512d
#define __AVX512i __m512i
#define _AVX512_LOAD  _mm512_load_pd
#define _AVX512_STORE  _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
Andreas Marek's avatar
Andreas Marek committed
66 67 68
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87

#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
#endif

#define _AVX512_FMA _mm512_FMA_pd
#endif /* DOUBLE_PRECISION_REAL */

#ifdef SINGLE_PRECISION_REAL
#define offset 16

#define __AVX512_DATATYPE __m512
#define __AVX512i __m512i
#define _AVX512_LOAD  _mm512_load_ps
#define _AVX512_STORE  _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
Andreas Marek's avatar
Andreas Marek committed
88 89 90
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156

#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
#endif

#define _AVX512_FMA _mm512_FMA_ps
#endif /* SINGLE_PRECISION_REAL */

#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);

void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);

void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_real_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_real_avx512_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f>   subroutine double_hh_trafo_real_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_real_avx512_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_float)      :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
157 158 159 160 161 162
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;
Andreas Marek's avatar
Andreas Marek committed
163

Andreas Marek's avatar
Andreas Marek committed
164
        worked_on = 0;
165

Andreas Marek's avatar
Andreas Marek committed
166 167
        // calculating scalar product to compute
        // 2 householder vectors simultaneously
168
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
169
        double s = hh[(ldh)+1]*1.0;
170 171
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
172
        float s = hh[(ldh)+1]*1.0f;
173
#endif
Andreas Marek's avatar
Andreas Marek committed
174 175 176 177 178
        #pragma ivdep
        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * hh[(i+ldh)];
        }
179

Andreas Marek's avatar
Andreas Marek committed
180
        // Production level kernel calls with padding
181
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
182 183 184 185 186
        for (i = 0; i < nq-24; i+=32)
        {
                hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 32;
        }
187 188
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
189 190 191 192 193 194 195 196 197 198
        for (i = 0; i < nq-48; i+=64)
        {
                hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 64;
        }
#endif
        if (nq == i)
        {
                return;
        }
199
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
200
        if (nq-i == 24)
Andreas Marek's avatar
Andreas Marek committed
201 202 203 204
        {
                hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 24;
        }
205 206
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
207 208 209 210 211
        if (nq-i == 48)
        {
                hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 48;
        }
212 213
#endif
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
214 215 216 217 218
        if (nq-i == 16)
        {
                hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
219 220
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
221 222 223 224 225
        if (nq-i == 32)
        {
                hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 32;
        }
226 227
#endif
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
228 229 230 231 232
        if (nq-i == 8)
        {
                hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
233 234
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
235 236 237 238 239
        if (nq-i == 16)
        {
                hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
240
#endif
241 242

#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
243
        if (worked_on != nq)
Andreas Marek's avatar
Andreas Marek committed
244 245 246 247
        {
                 printf("Error in AVX512 real BLOCK 2 kernel \n");
                 abort();
        }
248
#endif
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 32 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 64 rows of Q simultaneously, a
#endif

 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
270 271 272 273 274 275
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [24 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
        // Needed bit mask for floating point sign flip
276 277 278 279 280 281 282
#ifdef DOUBLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

Andreas Marek's avatar
Andreas Marek committed
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
        __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
        __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
        __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
        __AVX512_DATATYPE x4 = _AVX512_LOAD(&q[ldq+3*offset]);


        __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h2;

        __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
        __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
        __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
        __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
        __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
        __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
        __AVX512_DATATYPE q4 = _AVX512_LOAD(&q[3*offset]);
        __AVX512_DATATYPE y4 = _AVX512_FMA(x4, h1, q4);

        for(i = 2; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-1]);
                h2 = _AVX512_SET1(hh[ldh+i]);

                q1 = _AVX512_LOAD(&q[i*ldq]);
                x1 = _AVX512_FMA(q1, h1, x1);
                y1 = _AVX512_FMA(q1, h2, y1);
                q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
                x2 = _AVX512_FMA(q2, h1, x2);
                y2 = _AVX512_FMA(q2, h2, y2);
                q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
                x3 = _AVX512_FMA(q3, h1, x3);
                y3 = _AVX512_FMA(q3, h2, y3);
                q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
                x4 = _AVX512_FMA(q4, h1, x4);
                y4 = _AVX512_FMA(q4, h2, y4);

        }

        h1 = _AVX512_SET1(hh[nb-1]);

        q1 = _AVX512_LOAD(&q[nb*ldq]);
        x1 = _AVX512_FMA(q1, h1, x1);
        q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
        x2 = _AVX512_FMA(q2, h1, x2);
        q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
        x3 = _AVX512_FMA(q3, h1, x3);
        q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
        x4 = _AVX512_FMA(q4, h1, x4);


        /////////////////////////////////////////////////////
        // Rank-2 update of Q [24 x nb+1]
        /////////////////////////////////////////////////////

        __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs = _AVX512_SET1(s);
340

Andreas Marek's avatar
Andreas Marek committed
341
#ifdef HAVE_AVX512_XEON_PHI
342
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
343
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
344 345
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
346
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
347 348
#endif

Andreas Marek's avatar
Andreas Marek committed
349 350 351 352
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau1, sign);
#endif
353 354 355 356 357 358
#endif
	x1 = _AVX512_MUL(x1, h1);
	x2 = _AVX512_MUL(x2, h1);
	x3 = _AVX512_MUL(x3, h1);
	x4 = _AVX512_MUL(x4, h1);

Andreas Marek's avatar
Andreas Marek committed
359
#ifdef HAVE_AVX512_XEON_PHI
360
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
361
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
362 363 364
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
Andreas Marek's avatar
Andreas Marek committed
365 366 367 368 369 370
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau2, sign);
#endif
371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
#endif
	h2 = _AVX512_MUL(h1, vs);
	y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
	y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
	y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
	y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));

	q1 = _AVX512_LOAD(q);
	q1 = _AVX512_ADD(q1, y1);
	_AVX512_STORE(q,q1);
	q2 = _AVX512_LOAD(&q[offset]);
	q2 = _AVX512_ADD(q2, y2);
	_AVX512_STORE(&q[offset],q2);
	q3 = _AVX512_LOAD(&q[2*offset]);
	q3 = _AVX512_ADD(q3, y3);
	_AVX512_STORE(&q[2*offset],q3);
	q4 = _AVX512_LOAD(&q[3*offset]);
	q4 = _AVX512_ADD(q4, y4);
	_AVX512_STORE(&q[3*offset],q4);

	h2 = _AVX512_SET1(hh[ldh+1]);

	q1 = _AVX512_LOAD(&q[ldq]);
	q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
	_AVX512_STORE(&q[ldq],q1);
	q2 = _AVX512_LOAD(&q[ldq+offset]);
	q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
	_AVX512_STORE(&q[ldq+offset],q2);
	q3 = _AVX512_LOAD(&q[ldq+2*offset]);
	q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
	_AVX512_STORE(&q[ldq+2*offset],q3);
	q4 = _AVX512_LOAD(&q[ldq+3*offset]);
	q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
	_AVX512_STORE(&q[ldq+3*offset],q4);

	for (i = 2; i < nb; i++)
	{
		h1 = _AVX512_SET1(hh[i-1]);
		h2 = _AVX512_SET1(hh[ldh+i]);

		q1 = _AVX512_LOAD(&q[i*ldq]);
		q1 = _AVX512_FMA(x1, h1, q1);
		q1 = _AVX512_FMA(y1, h2, q1);
		_AVX512_STORE(&q[i*ldq],q1);
		q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
		q2 = _AVX512_FMA(x2, h1, q2);
		q2 = _AVX512_FMA(y2, h2, q2);
		_AVX512_STORE(&q[(i*ldq)+offset],q2);
		q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
		q3 = _AVX512_FMA(x3, h1, q3);
		q3 = _AVX512_FMA(y3, h2, q3);
		_AVX512_STORE(&q[(i*ldq)+2*offset],q3);
		q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
		q4 = _AVX512_FMA(x4, h1, q4);
		q4 = _AVX512_FMA(y4, h2, q4);
		_AVX512_STORE(&q[(i*ldq)+3*offset],q4);

	}

	h1 = _AVX512_SET1(hh[nb-1]);

	q1 = _AVX512_LOAD(&q[nb*ldq]);
	q1 = _AVX512_FMA(x1, h1, q1);
	_AVX512_STORE(&q[nb*ldq],q1);
	q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
	q2 = _AVX512_FMA(x2, h1, q2);
	_AVX512_STORE(&q[(nb*ldq)+offset],q2);
	q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
	q3 = _AVX512_FMA(x3, h1, q3);
	_AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
	q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
	q4 = _AVX512_FMA(x4, h1, q4);
	_AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
444
>>>>>>> Skylake
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466

}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 24 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 48 rows of Q simultaneously, a
#endif

 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
467 468 469 470 471 472
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [24 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
        // Needed bit mask for floating point sign flip
473 474 475 476 477 478
#ifdef DOUBLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
Andreas Marek's avatar
Andreas Marek committed
479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
        __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
        __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
        __AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);

        __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h2;

        __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
        __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
        __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
        __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
        __AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
        __AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);

        for(i = 2; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-1]);
                h2 = _AVX512_SET1(hh[ldh+i]);

                q1 = _AVX512_LOAD(&q[i*ldq]);
                x1 = _AVX512_FMA(q1, h1, x1);
                y1 = _AVX512_FMA(q1, h2, y1);
                q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
                x2 = _AVX512_FMA(q2, h1, x2);
                y2 = _AVX512_FMA(q2, h2, y2);
                q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
                x3 = _AVX512_FMA(q3, h1, x3);
                y3 = _AVX512_FMA(q3, h2, y3);
        }

        h1 = _AVX512_SET1(hh[nb-1]);

        q1 = _AVX512_LOAD(&q[nb*ldq]);
        x1 = _AVX512_FMA(q1, h1, x1);
        q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
        x2 = _AVX512_FMA(q2, h1, x2);
        q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
        x3 = _AVX512_FMA(q3, h1, x3);

        /////////////////////////////////////////////////////
        // Rank-2 update of Q [24 x nb+1]
        /////////////////////////////////////////////////////

        __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs = _AVX512_SET1(s);
525

Andreas Marek's avatar
Andreas Marek committed
526
#ifdef HAVE_AVX512_XEON_PHI
527
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
528
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
529 530
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
531
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
532
#endif
Andreas Marek's avatar
Andreas Marek committed
533 534 535 536 537 538 539
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau1, sign);
#endif
#endif

540 541 542 543
	x1 = _AVX512_MUL(x1, h1);
	x2 = _AVX512_MUL(x2, h1);
	x3 = _AVX512_MUL(x3, h1);

Andreas Marek's avatar
Andreas Marek committed
544
#ifdef HAVE_AVX512_XEON_PHI
545
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
546
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
547 548 549 550
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
551 552 553 554 555 556 557
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau2, sign);
#endif
#endif

558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615
	h2 = _AVX512_MUL(h1, vs);
	y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
	y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
	y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));

	q1 = _AVX512_LOAD(q);
	q1 = _AVX512_ADD(q1, y1);
	_AVX512_STORE(q,q1);
	q2 = _AVX512_LOAD(&q[offset]);
	q2 = _AVX512_ADD(q2, y2);
	_AVX512_STORE(&q[offset],q2);
	q3 = _AVX512_LOAD(&q[2*offset]);
	q3 = _AVX512_ADD(q3, y3);
	_AVX512_STORE(&q[2*offset],q3);

	h2 = _AVX512_SET1(hh[ldh+1]);

	q1 = _AVX512_LOAD(&q[ldq]);
	q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
	_AVX512_STORE(&q[ldq],q1);
	q2 = _AVX512_LOAD(&q[ldq+offset]);
	q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
	_AVX512_STORE(&q[ldq+offset],q2);
	q3 = _AVX512_LOAD(&q[ldq+2*offset]);
	q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
	_AVX512_STORE(&q[ldq+2*offset],q3);

	for (i = 2; i < nb; i++)
	{
		h1 = _AVX512_SET1(hh[i-1]);
		h2 = _AVX512_SET1(hh[ldh+i]);

		q1 = _AVX512_LOAD(&q[i*ldq]);
		q1 = _AVX512_FMA(x1, h1, q1);
		q1 = _AVX512_FMA(y1, h2, q1);
		_AVX512_STORE(&q[i*ldq],q1);
		q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
		q2 = _AVX512_FMA(x2, h1, q2);
		q2 = _AVX512_FMA(y2, h2, q2);
		_AVX512_STORE(&q[(i*ldq)+offset],q2);
		q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
		q3 = _AVX512_FMA(x3, h1, q3);
		q3 = _AVX512_FMA(y3, h2, q3);
		_AVX512_STORE(&q[(i*ldq)+2*offset],q3);

	}

	h1 = _AVX512_SET1(hh[nb-1]);

	q1 = _AVX512_LOAD(&q[nb*ldq]);
	q1 = _AVX512_FMA(x1, h1, q1);
	_AVX512_STORE(&q[nb*ldq],q1);
	q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
	q2 = _AVX512_FMA(x2, h1, q2);
	_AVX512_STORE(&q[(nb*ldq)+offset],q2);
	q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
	q3 = _AVX512_FMA(x3, h1, q3);
	_AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
616
>>>>>>> Skylake
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637

}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 32 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
638 639 640 641 642 643
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [16 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
        // Needed bit mask for floating point sign flip
644 645 646 647 648 649
#ifdef DOUBLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
Andreas Marek's avatar
Andreas Marek committed
650 651
        __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
        __AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
652

Andreas Marek's avatar
Andreas Marek committed
653 654
        __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h2;
655

Andreas Marek's avatar
Andreas Marek committed
656 657 658 659
        __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
        __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
        __AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
        __AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
660

Andreas Marek's avatar
Andreas Marek committed
661 662 663 664
        for(i = 2; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-1]);
                h2 = _AVX512_SET1(hh[ldh+i]);
665

Andreas Marek's avatar
Andreas Marek committed
666 667 668 669 670 671 672
                q1 = _AVX512_LOAD(&q[i*ldq]);
                x1 = _AVX512_FMA(q1, h1, x1);
                y1 = _AVX512_FMA(q1, h2, y1);
                q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
                x2 = _AVX512_FMA(q2, h1, x2);
                y2 = _AVX512_FMA(q2, h2, y2);
        }
673

Andreas Marek's avatar
Andreas Marek committed
674
        h1 = _AVX512_SET1(hh[nb-1]);
675

Andreas Marek's avatar
Andreas Marek committed
676 677 678 679
        q1 = _AVX512_LOAD(&q[nb*ldq]);
        x1 = _AVX512_FMA(q1, h1, x1);
        q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
        x2 = _AVX512_FMA(q2, h1, x2);
680

Andreas Marek's avatar
Andreas Marek committed
681 682 683
        /////////////////////////////////////////////////////
        // Rank-2 update of Q [16 x nb+1]
        /////////////////////////////////////////////////////
684 685 686 687

	__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
	__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
	__AVX512_DATATYPE vs = _AVX512_SET1(s);
Andreas Marek's avatar
Andreas Marek committed
688
#ifdef HAVE_AVX512_XEON_PHI
689
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
690
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
691 692
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
693
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
694
#endif
Andreas Marek's avatar
Andreas Marek committed
695 696 697 698 699
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau1, sign);
#endif
700 701 702
#endif
	x1 = _AVX512_MUL(x1, h1);
	x2 = _AVX512_MUL(x2, h1);
Andreas Marek's avatar
Andreas Marek committed
703
#ifdef HAVE_AVX512_XEON_PHI
704
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
705
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
706 707 708 709
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
710 711 712 713 714
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau2, sign);
#endif
Andreas Marek's avatar
Andreas Marek committed
715 716
#endif

717
	h2 = _AVX512_MUL(h1, vs);
Andreas Marek's avatar
Andreas Marek committed
718

719 720
	y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
	y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
Andreas Marek's avatar
Andreas Marek committed
721

722 723 724 725 726 727
	q1 = _AVX512_LOAD(q);
	q1 = _AVX512_ADD(q1, y1);
	_AVX512_STORE(q,q1);
	q2 = _AVX512_LOAD(&q[offset]);
	q2 = _AVX512_ADD(q2, y2);
	_AVX512_STORE(&q[offset],q2);
Andreas Marek's avatar
Andreas Marek committed
728

729
	h2 = _AVX512_SET1(hh[ldh+1]);
Andreas Marek's avatar
Andreas Marek committed
730

731 732 733 734 735 736
	q1 = _AVX512_LOAD(&q[ldq]);
	q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
	_AVX512_STORE(&q[ldq],q1);
	q2 = _AVX512_LOAD(&q[ldq+offset]);
	q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
	_AVX512_STORE(&q[ldq+offset],q2);
Andreas Marek's avatar
Andreas Marek committed
737

738 739 740 741
	for (i = 2; i < nb; i++)
	{
		h1 = _AVX512_SET1(hh[i-1]);
		h2 = _AVX512_SET1(hh[ldh+i]);
Andreas Marek's avatar
Andreas Marek committed
742

743 744 745 746 747 748 749 750 751
		q1 = _AVX512_LOAD(&q[i*ldq]);
		q1 = _AVX512_FMA(x1, h1, q1);
		q1 = _AVX512_FMA(y1, h2, q1);
		_AVX512_STORE(&q[i*ldq],q1);
		q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
		q2 = _AVX512_FMA(x2, h1, q2);
		q2 = _AVX512_FMA(y2, h2, q2);
		_AVX512_STORE(&q[(i*ldq)+offset],q2);
	}
Andreas Marek's avatar
Andreas Marek committed
752

753 754 755 756 757 758 759 760
	h1 = _AVX512_SET1(hh[nb-1]);

	q1 = _AVX512_LOAD(&q[nb*ldq]);
	q1 = _AVX512_FMA(x1, h1, q1);
	_AVX512_STORE(&q[nb*ldq],q1);
	q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
	q2 = _AVX512_FMA(x2, h1, q2);
	_AVX512_STORE(&q[(nb*ldq)+offset],q2);
761
>>>>>>> Skylake
762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782

}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
783 784 785 786 787 788
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [8 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
        // Needed bit mask for floating point sign flip
789 790 791 792 793 794
#ifdef DOUBLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
        __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
Andreas Marek's avatar
Andreas Marek committed
795
        __AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
796

Andreas Marek's avatar
Andreas Marek committed
797 798
        __AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
        __AVX512_DATATYPE h2;
799

Andreas Marek's avatar
Andreas Marek committed
800 801
        __AVX512_DATATYPE q1 = _AVX512_LOAD(q);
        __AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
802

Andreas Marek's avatar
Andreas Marek committed
803 804 805 806
        for(i = 2; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-1]);
                h2 = _AVX512_SET1(hh[ldh+i]);
807

Andreas Marek's avatar
Andreas Marek committed
808 809 810 811
                q1 = _AVX512_LOAD(&q[i*ldq]);
                x1 = _AVX512_FMA(q1, h1, x1);
                y1 = _AVX512_FMA(q1, h2, y1);
        }
812

Andreas Marek's avatar
Andreas Marek committed
813
        h1 = _AVX512_SET1(hh[nb-1]);
814

Andreas Marek's avatar
Andreas Marek committed
815 816
        q1 = _AVX512_LOAD(&q[nb*ldq]);
        x1 = _AVX512_FMA(q1, h1, x1);
817

Andreas Marek's avatar
Andreas Marek committed
818 819 820
        /////////////////////////////////////////////////////
        // Rank-2 update of Q [8 x nb+1]
        /////////////////////////////////////////////////////
821 822 823 824

	__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
	__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
	__AVX512_DATATYPE vs = _AVX512_SET1(s);
Andreas Marek's avatar
Andreas Marek committed
825
#ifdef HAVE_AVX512_XEON_PHI
826
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
827
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
828 829
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
830
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
831
#endif
Andreas Marek's avatar
Andreas Marek committed
832 833 834 835 836
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau1, sign);
#endif
837 838
#endif

Andreas Marek's avatar
Andreas Marek committed
839 840
	x1 = _AVX512_MUL(x1, h1);
#ifdef HAVE_AVX512_XEON_PHI
841
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
842
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
843 844
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
845
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
Andreas Marek's avatar
Andreas Marek committed
846 847 848 849 850 851
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
	h1 = _AVX512_XOR(tau2, sign);
#endif
852 853
#endif

Andreas Marek's avatar
Andreas Marek committed
854
        h2 = _AVX512_MUL(h1, vs);
855

Andreas Marek's avatar
Andreas Marek committed
856
        y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
857

Andreas Marek's avatar
Andreas Marek committed
858 859 860
        q1 = _AVX512_LOAD(q);
        q1 = _AVX512_ADD(q1, y1);
        _AVX512_STORE(q,q1);
861

Andreas Marek's avatar
Andreas Marek committed
862
        h2 = _AVX512_SET1(hh[ldh+1]);
863

Andreas Marek's avatar
Andreas Marek committed
864 865 866
        q1 = _AVX512_LOAD(&q[ldq]);
        q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
        _AVX512_STORE(&q[ldq],q1);
867

Andreas Marek's avatar
Andreas Marek committed
868 869 870 871
        for (i = 2; i < nb; i++)
        {
                h1 = _AVX512_SET1(hh[i-1]);
                h2 = _AVX512_SET1(hh[ldh+i]);
872

Andreas Marek's avatar
Andreas Marek committed
873 874 875 876 877
                q1 = _AVX512_LOAD(&q[i*ldq]);
                q1 = _AVX512_FMA(x1, h1, q1);
                q1 = _AVX512_FMA(y1, h2, q1);
                _AVX512_STORE(&q[i*ldq],q1);
        }
878

Andreas Marek's avatar
Andreas Marek committed
879
        h1 = _AVX512_SET1(hh[nb-1]);
880

Andreas Marek's avatar
Andreas Marek committed
881 882 883
        q1 = _AVX512_LOAD(&q[nb*ldq]);
        q1 = _AVX512_FMA(x1, h1, q1);
        _AVX512_STORE(&q[nb*ldq],q1);
884 885 886

}