// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Lesser General
// Public License as published by the Free Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
// map MSVC-style __forceinline onto the GCC/Clang always_inline attribute
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define offset 8 // elements per 512-bit vector (double)
#define __AVX512_DATATYPE __m512d
#define __AVX512i __m512i
#define _AVX512_LOAD _mm512_load_pd
#define _AVX512_STORE _mm512_store_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
#endif
#define _AVX512_FMA _mm512_FMA_pd
#endif /* DOUBLE_PRECISION_REAL */
#ifdef SINGLE_PRECISION_REAL
#define offset 16 // elements per 512-bit vector (float)
#define __AVX512_DATATYPE __m512
#define __AVX512i __m512i
#define _AVX512_LOAD _mm512_load_ps
#define _AVX512_STORE _mm512_store_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
#endif
#define _AVX512_FMA _mm512_FMA_ps
#endif /* SINGLE_PRECISION_REAL */
#ifdef DOUBLE_PRECISION_REAL
// Forward declarations
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
// Forward declarations
__forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_real_avx512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx512_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_real_avx512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx512_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
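/*
 * Illustrative C-side call of the double-precision entry point (a minimal
 * sketch; the concrete sizes below are assumptions for the example only).
 * q must be 64-byte aligned, since the kernels use aligned AVX-512 loads
 * and stores:
 *
 *   int nb = 64, nq = 32, ldq = 32, ldh = 64;
 *   double *q  = _mm_malloc((size_t)(nb + 1) * ldq * sizeof(double), 64);
 *   double *hh = _mm_malloc((size_t)2 * ldh * sizeof(double), 64);
 *   // ... fill q with the row block and hh with the two reflectors ...
 *   double_hh_trafo_real_avx512_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);
 *   _mm_free(q); _mm_free(hh);
 */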
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
int i;
int nb = *pnb;
int nq = *pldq; // note: the row count is taken from *pldq; *pnq is not used
int ldq = *pldq;
int ldh = *pldh;
int worked_on = 0;
// calculate the scalar product of the two Householder vectors;
// it is needed to apply both of them in a single sweep over q
#ifdef DOUBLE_PRECISION_REAL
double s = hh[ldh+1];
#endif
#ifdef SINGLE_PRECISION_REAL
float s = hh[ldh+1];
#endif
#pragma ivdep
for (i = 2; i < nb; i++)
{
s += hh[i-1] * hh[(i+ldh)];
}
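// In scalar terms, s is the dot product of the two reflectors offset by
// one row against each other:
//   s = v2[1] + sum_{i=2}^{nb-1} v1[i-1] * v2[i]
// with v1[i] = hh[i] and v2[i] = hh[ldh+i]; the leading element of v1 is
// an implicit 1.0 (hh[0] and hh[ldh] store tau1 and tau2 instead), and
// the i == 1 term is the initialisation of s above.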
// Production level kernel calls with padding
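// The widest kernel handles as many full blocks of rows as possible; any
// remainder (kept a multiple of the vector width by the caller's padding)
// is dispatched to one of the narrower kernels below.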
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-24; i+=32)
{
hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 32;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-48; i+=64)
{
hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 64;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 24)
{
hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 48)
{
hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 48;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 32)
{
hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 32;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in AVX512 real BLOCK 2 kernel\n");
abort();
}
#endif
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 32 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 64 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
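/*
 * For reference, a scalar sketch of what each of these kernels computes on
 * its block of rows (a hypothetical helper, not part of the build; nq_blk
 * stands for the kernel's row count, e.g. 32 here in double precision):
 *
 *   static void hh_trafo_ref(double* q, double* hh, int nb, int ldq,
 *                            int ldh, double s, int nq_blk)
 *   {
 *     for (int r = 0; r < nq_blk; r++)
 *     {
 *       // x = <q-column, v1>, y = <q-column, v2> (leading 1s implicit)
 *       double x = q[ldq + r];
 *       double y = q[r] + x * hh[ldh + 1];
 *       for (int i = 2; i < nb; i++)
 *       {
 *         x += q[i * ldq + r] * hh[i - 1];
 *         y += q[i * ldq + r] * hh[ldh + i];
 *       }
 *       x += q[nb * ldq + r] * hh[nb - 1];
 *       // rank-2 update with tau1 = hh[0], tau2 = hh[ldh]
 *       x *= -hh[0];
 *       y = -hh[ldh] * (y + x * s);
 *       q[r] += y;
 *       q[ldq + r] += x + y * hh[ldh + 1];
 *       for (int i = 2; i < nb; i++)
 *         q[i * ldq + r] += x * hh[i - 1] + y * hh[ldh + i];
 *       q[nb * ldq + r] += x * hh[nb - 1];
 *     }
 *   }
 */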
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [32 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
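// XOR with this mask flips only the IEEE sign bit, negating tau1/tau2
// below without a multiply (each reflector is applied as I - tau*v*v^T).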
__AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
__AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
__AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
__AVX512_DATATYPE x4 = _AVX512_LOAD(&q[ldq+3*offset]);
__AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
__AVX512_DATATYPE h2;
__AVX512_DATATYPE q1 = _AVX512_LOAD(q);
__AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
__AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
__AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
__AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
__AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
__AVX512_DATATYPE q4 = _AVX512_LOAD(&q[3*offset]);
__AVX512_DATATYPE y4 = _AVX512_FMA(x4, h1, q4);
for(i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
y1 = _AVX512_FMA(q1, h2, y1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
y2 = _AVX512_FMA(q2, h2, y2);
q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX512_FMA(q3, h1, x3);
y3 = _AVX512_FMA(q3, h2, y3);
q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX512_FMA(q4, h1, x4);
y4 = _AVX512_FMA(q4, h2, y4);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX512_FMA(q3, h1, x3);
q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX512_FMA(q4, h1, x4);
/////////////////////////////////////////////////////
// Rank-2 update of Q [32 x nb+1]
/////////////////////////////////////////////////////
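// In matrix form this applies both reflectors to the row block:
//   Q <- (I - tau2*v2*v2^T) * (I - tau1*v1*v1^T) * Q
// using x = Q^T*v1, y (accumulated above), and the shifted dot product s.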
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
x4 = _AVX512_MUL(x4, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));
q1 = _AVX512_LOAD(q);
q1 = _AVX512_ADD(q1, y1);
_AVX512_STORE(q,q1);
q2 = _AVX512_LOAD(&q[offset]);
q2 = _AVX512_ADD(q2, y2);
_AVX512_STORE(&q[offset],q2);
q3 = _AVX512_LOAD(&q[2*offset]);
q3 = _AVX512_ADD(q3, y3);
_AVX512_STORE(&q[2*offset],q3);
q4 = _AVX512_LOAD(&q[3*offset]);
q4 = _AVX512_ADD(q4, y4);
_AVX512_STORE(&q[3*offset],q4);
h2 = _AVX512_SET1(hh[ldh+1]);
q1 = _AVX512_LOAD(&q[ldq]);
q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
_AVX512_STORE(&q[ldq],q1);
q2 = _AVX512_LOAD(&q[ldq+offset]);
q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
_AVX512_STORE(&q[ldq+offset],q2);
q3 = _AVX512_LOAD(&q[ldq+2*offset]);
q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
_AVX512_STORE(&q[ldq+2*offset],q3);
q4 = _AVX512_LOAD(&q[ldq+3*offset]);
q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
_AVX512_STORE(&q[ldq+3*offset],q4);
for (i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
q1 = _AVX512_FMA(y1, h2, q1);
_AVX512_STORE(&q[i*ldq],q1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
q2 = _AVX512_FMA(y2, h2, q2);
_AVX512_STORE(&q[(i*ldq)+offset],q2);
q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
q3 = _AVX512_FMA(x3, h1, q3);
q3 = _AVX512_FMA(y3, h2, q3);
_AVX512_STORE(&q[(i*ldq)+2*offset],q3);
q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
q4 = _AVX512_FMA(x4, h1, q4);
q4 = _AVX512_FMA(y4, h2, q4);
_AVX512_STORE(&q[(i*ldq)+3*offset],q4);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
_AVX512_STORE(&q[nb*ldq],q1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
_AVX512_STORE(&q[(nb*ldq)+offset],q2);
q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX512_FMA(x3, h1, q3);
_AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX512_FMA(x4, h1, q4);
_AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 24 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 48 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [24 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
__AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
__AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
__AVX512_DATATYPE x3 = _AVX512_LOAD(&q[ldq+2*offset]);
__AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
__AVX512_DATATYPE h2;
__AVX512_DATATYPE q1 = _AVX512_LOAD(q);
__AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
__AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
__AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
__AVX512_DATATYPE q3 = _AVX512_LOAD(&q[2*offset]);
__AVX512_DATATYPE y3 = _AVX512_FMA(x3, h1, q3);
for(i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
y1 = _AVX512_FMA(q1, h2, y1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
y2 = _AVX512_FMA(q2, h2, y2);
q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX512_FMA(q3, h1, x3);
y3 = _AVX512_FMA(q3, h2, y3);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX512_FMA(q3, h1, x3);
/////////////////////////////////////////////////////
// Rank-2 update of Q [24 x nb+1]
/////////////////////////////////////////////////////
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
q1 = _AVX512_LOAD(q);
q1 = _AVX512_ADD(q1, y1);
_AVX512_STORE(q,q1);
q2 = _AVX512_LOAD(&q[offset]);
q2 = _AVX512_ADD(q2, y2);
_AVX512_STORE(&q[offset],q2);
q3 = _AVX512_LOAD(&q[2*offset]);
q3 = _AVX512_ADD(q3, y3);
_AVX512_STORE(&q[2*offset],q3);
h2 = _AVX512_SET1(hh[ldh+1]);
q1 = _AVX512_LOAD(&q[ldq]);
q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
_AVX512_STORE(&q[ldq],q1);
q2 = _AVX512_LOAD(&q[ldq+offset]);
q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
_AVX512_STORE(&q[ldq+offset],q2);
q3 = _AVX512_LOAD(&q[ldq+2*offset]);
q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
_AVX512_STORE(&q[ldq+2*offset],q3);
for (i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
q1 = _AVX512_FMA(y1, h2, q1);
_AVX512_STORE(&q[i*ldq],q1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
q2 = _AVX512_FMA(y2, h2, q2);
_AVX512_STORE(&q[(i*ldq)+offset],q2);
q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
q3 = _AVX512_FMA(x3, h1, q3);
q3 = _AVX512_FMA(y3, h2, q3);
_AVX512_STORE(&q[(i*ldq)+2*offset],q3);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
_AVX512_STORE(&q[nb*ldq],q1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
_AVX512_STORE(&q[(nb*ldq)+offset],q2);
q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX512_FMA(x3, h1, q3);
_AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 16 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 32 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [16 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
__AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
__AVX512_DATATYPE x2 = _AVX512_LOAD(&q[ldq+offset]);
__AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
__AVX512_DATATYPE h2;
__AVX512_DATATYPE q1 = _AVX512_LOAD(q);
__AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
__AVX512_DATATYPE q2 = _AVX512_LOAD(&q[offset]);
__AVX512_DATATYPE y2 = _AVX512_FMA(x2, h1, q2);
for(i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
y1 = _AVX512_FMA(q1, h2, y1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
y2 = _AVX512_FMA(q2, h2, y2);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX512_FMA(q2, h1, x2);
/////////////////////////////////////////////////////
// Rank-2 update of Q [16 x nb+1]
/////////////////////////////////////////////////////
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
q1 = _AVX512_LOAD(q);
q1 = _AVX512_ADD(q1, y1);
_AVX512_STORE(q,q1);
q2 = _AVX512_LOAD(&q[offset]);
q2 = _AVX512_ADD(q2, y2);
_AVX512_STORE(&q[offset],q2);
h2 = _AVX512_SET1(hh[ldh+1]);
q1 = _AVX512_LOAD(&q[ldq]);
q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
_AVX512_STORE(&q[ldq],q1);
q2 = _AVX512_LOAD(&q[ldq+offset]);
q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
_AVX512_STORE(&q[ldq+offset],q2);
for (i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
q1 = _AVX512_FMA(y1, h2, q1);
_AVX512_STORE(&q[i*ldq],q1);
q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
q2 = _AVX512_FMA(y2, h2, q2);
_AVX512_STORE(&q[(i*ldq)+offset],q2);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
_AVX512_STORE(&q[nb*ldq],q1);
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2);
_AVX512_STORE(&q[(nb*ldq)+offset],q2);
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 16 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [8 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
__AVX512_DATATYPE x1 = _AVX512_LOAD(&q[ldq]);
__AVX512_DATATYPE h1 = _AVX512_SET1(hh[ldh+1]);
__AVX512_DATATYPE h2;
__AVX512_DATATYPE q1 = _AVX512_LOAD(q);
__AVX512_DATATYPE y1 = _AVX512_FMA(x1, h1, q1);
for(i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
y1 = _AVX512_FMA(q1, h2, y1);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
x1 = _AVX512_FMA(q1, h1, x1);
/////////////////////////////////////////////////////
// Rank-2 update of Q [8 x nb+1]
/////////////////////////////////////////////////////
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
q1 = _AVX512_LOAD(q);
q1 = _AVX512_ADD(q1, y1);
_AVX512_STORE(q,q1);
h2 = _AVX512_SET1(hh[ldh+1]);
q1 = _AVX512_LOAD(&q[ldq]);
q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
_AVX512_STORE(&q[ldq],q1);
for (i = 2; i < nb; i++)
{
h1 = _AVX512_SET1(hh[i-1]);
h2 = _AVX512_SET1(hh[ldh+i]);
q1 = _AVX512_LOAD(&q[i*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
q1 = _AVX512_FMA(y1, h2, q1);
_AVX512_STORE(&q[i*ldq],q1);
}
h1 = _AVX512_SET1(hh[nb-1]);
q1 = _AVX512_LOAD(&q[nb*ldq]);
q1 = _AVX512_FMA(x1, h1, q1);
_AVX512_STORE(&q[nb*ldq],q1);
}