Commit e476f8b7 authored by Andreas Marek

Smaller step sizes in complex AVX512 Block 2 kernel

parent 154f4923
......@@ -48,6 +48,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#define __forceinline __attribute__((always_inline))
......@@ -104,12 +105,16 @@
//Forward declaration
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s);
#endif
/*
......@@ -154,6 +159,9 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex*
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on;
worked_on = 0;
#ifdef DOUBLE_PRECISION_COMPLEX
double complex s = conj(hh[(ldh)+1])*1.0;
......@@ -167,27 +175,69 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex*
}
#ifdef DOUBLE_PRECISION_COMPLEX
for (i = 0; i < nq-8; i+=16)
for (i = 0; i < nq-12; i+=16)
{
hh_trafo_complex_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
for (i = 0; i < nq-16; i+=32)
for (i = 0; i < nq-24; i+=32)
{
hh_trafo_complex_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
if (nq-i == 0) {
return;
} else {
}
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 12 ) {
hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 24 ) {
hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 8 ) {
hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 16 ) {
hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 4 ) {
hh_trafo_complex_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 8 ) {
hh_trafo_complex_kernel_8_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
if (worked_on != nq)
{
printf("Error in complex AVX512 BLOCK 2 kernel \n");
}
}
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -551,11 +601,12 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
_AVX512_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
static __forceinline void hh_trafo_complex_kernel_12_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
......@@ -569,12 +620,11 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
float* hh_dbl = (float*)hh;
float* s_dbl = (float*)(&s);
#endif
__AVX512_DATATYPE x1, x2;
__AVX512_DATATYPE y1, y2;
__AVX512_DATATYPE q1, q2;
__AVX512_DATATYPE x1, x2, x3, x4;
__AVX512_DATATYPE y1, y2, y3, y4;
__AVX512_DATATYPE q1, q2, q3, q4;
__AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__AVX512_DATATYPE tmp1, tmp2;
__AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
int i=0;
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -584,14 +634,16 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);
x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]); // q1, q2, q3, q4
x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]); // q5, q6, q7, q8
x3 = _AVX512_LOAD(&q_dbl[(2*ldq)+2*offset]); // q9, q10, q11, q12
h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
y1 = _AVX512_LOAD(&q_dbl[0]);
y2 = _AVX512_LOAD(&q_dbl[offset]);
y3 = _AVX512_LOAD(&q_dbl[2*offset]);
tmp1 = _AVX512_MUL(h2_imag, x1);
......@@ -601,10 +653,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h2_imag, x3);
y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
for (i = 2; i < nb; i++)
{
q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
......@@ -617,6 +674,10 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h1_imag, q3);
x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
......@@ -627,6 +688,11 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
tmp2 = _AVX512_MUL(h2_imag, q2);
y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h2_imag, q3);
y3 = _AVX512_ADD(y3, _AVX512_FMSUBADD(h2_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
}
h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
......@@ -634,6 +700,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
tmp1 = _AVX512_MUL(h1_imag, q1);
......@@ -643,6 +710,10 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h1_imag, q3);
x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
......@@ -654,8 +725,6 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -664,6 +733,10 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
tmp3 = _AVX512_MUL(h1_imag, x3);
x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
......@@ -696,7 +769,6 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif
tmp1 = _AVX512_MUL(h2_imag, tmp2);
tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -714,6 +786,10 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
tmp3 = _AVX512_MUL(h1_imag, y3);
y3 = _AVX512_FMADDSUB(h1_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
tmp1 = _AVX512_MUL(h2_imag, x1);
y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
......@@ -722,23 +798,32 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h2_imag, x3);
y3 = _AVX512_ADD(y3, _AVX512_FMADDSUB(h2_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q1 = _AVX512_LOAD(&q_dbl[0]);
q2 = _AVX512_LOAD(&q_dbl[offset]);
q3 = _AVX512_LOAD(&q_dbl[2*offset]);
q1 = _AVX512_ADD(q1, y1);
q2 = _AVX512_ADD(q2, y2);
q3 = _AVX512_ADD(q3, y3);
_AVX512_STORE(&q_dbl[0], q1);
_AVX512_STORE(&q_dbl[offset], q2);
_AVX512_STORE(&q_dbl[2*offset], q3);
h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
q3 = _AVX512_LOAD(&q_dbl[(ldq*2)+2*offset]);
q1 = _AVX512_ADD(q1, x1);
q2 = _AVX512_ADD(q2, x2);
q3 = _AVX512_ADD(q3, x3);
tmp1 = _AVX512_MUL(h2_imag, y1);
......@@ -748,13 +833,19 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h2_imag, y3);
q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
_AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
_AVX512_STORE(&q_dbl[(ldq*2)+2*offset], q3);
for (i = 2; i < nb; i++)
{
q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
......@@ -767,6 +858,10 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h1_imag, x3);
q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
......@@ -778,14 +873,21 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h2_imag, y3);
q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h2_real, y3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
_AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
_AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
}
h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
q3 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -795,7 +897,453 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
tmp3 = _AVX512_MUL(h1_imag, x3);
q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
_AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
_AVX512_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
}
#ifdef DOUBLE_PRECISION_COMPLEX
// Apply a rank-2 (block-2) complex Householder transformation to 8 columns
// of q, double precision, using AVX-512 intrinsics.
//   q   : matrix tile, column-major with leading dimension ldq (complex elements)
//   hh  : the two Householder vectors; the second one starts at row offset ldh
//   nb  : length of the Householder vectors
//   s   : scalar coupling the two reflectors; the caller initializes it from
//         conj(hh[ldh+1]) — NOTE(review): exact accumulation of s happens in the
//         caller, outside this view — confirm there.
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
// Single-precision variant of the same kernel: processes 16 complex floats.
static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
// Reinterpret the complex arrays as interleaved (re, im) scalar arrays;
// all index arithmetic below is therefore doubled (factor 2 per complex).
#ifdef DOUBLE_PRECISION_COMPLEX
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
float* s_dbl = (float*)(&s);
#endif
// x1,x2: accumulators for q^H applied to Householder vector 1
// y1,y2: accumulators for q^H applied to Householder vector 2
// Two ZMM registers cover the 8 (double) / 16 (single) complex columns.
__AVX512_DATATYPE x1, x2;
__AVX512_DATATYPE y1, y2;
__AVX512_DATATYPE q1, q2;
__AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__AVX512_DATATYPE tmp1, tmp2;
int i=0;
// Sign mask flipping the IEEE sign bit of every lane; used below to negate
// the Householder coefficients (tau) via xor.
#ifdef DOUBLE_PRECISION_COMPLEX
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
// Load row 1 of q (start of the x accumulation).
// NOTE(review): `offset` is a macro defined outside this view — presumably
// the number of scalars per ZMM register (8 doubles / 16 floats); confirm.
x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _AVX512_LOAD(&q_dbl[(2*ldq)+offset]);
// Broadcast hh2[1] (real, imag separately) for the first y update.
h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
// Row 0 of q seeds the y accumulators.
y1 = _AVX512_LOAD(&q_dbl[0]);
y2 = _AVX512_LOAD(&q_dbl[offset]);
// Complex multiply-accumulate y += conj-style product of h2 and x:
// the imag part is pre-multiplied, shuffled into place, then fused with
// FMSUBADD to realize the interleaved complex product.
tmp1 = _AVX512_MUL(h2_imag, x1);
y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)))
tmp2 = _AVX512_MUL(h2_imag, x2);
y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
// Main reduction loop over the remaining rows: accumulate
// x += hh1[i-1]*q(i,:) and y += hh2[i]*q(i,:).
for (i = 2; i < nb; i++)
{
q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
tmp1 = _AVX512_MUL(h1_imag, q1);
x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h1_imag, q2);
x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
tmp1 = _AVX512_MUL(h2_imag, q1);
y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h2_imag, q2);
y2 = _AVX512_ADD(y2, _AVX512_FMSUBADD(h2_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
}
// Tail: row nb contributes to x only (hh2 has one fewer active entry).
h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
tmp1 = _AVX512_MUL(h1_imag, q1);
x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h1_imag, q2);
x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
// Scale x by -tau1 = -hh[0]: broadcast tau1 and flip its sign bits.
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
tmp2 = _AVX512_MUL(h1_imag, x2);
x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
// Broadcast -tau2 = -hh[ldh] into both h1 and h2 (both negated below):
// h1 scales y directly, h2 is further multiplied into s.
h1_real = _AVX512_SET1(hh_dbl[ldh*2]);
h1_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
// Broadcast the complex scalar s = (s_re, s_im) into every lane pair.
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
		     s_dbl[1], s_dbl[0],
		     s_dbl[1], s_dbl[0],
		     s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
// Reuse the double-broadcast trick: one 64-bit lane holds the (re, im)
// float pair, replicated across the register.
tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif
// s := (-tau2) * s, computed in SIMD; only the first complex element
// (2 scalars, mask 0x01|0x02) is written back to s_dbl.
tmp1 = _AVX512_MUL(h2_imag, tmp2);
tmp2 = _AVX512_FMADDSUB(h2_real, tmp2, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
_AVX512_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
// Re-broadcast the updated scalar s for the y correction below.
h2_real = _AVX512_SET1(s_dbl[0]);
h2_imag = _AVX512_SET1(s_dbl[1]);
// y := (-tau2)*y + s*x  (rank-2 coupling of the two reflectors).
tmp1 = _AVX512_MUL(h1_imag, y1);
y1 = _AVX512_FMADDSUB(h1_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
tmp2 = _AVX512_MUL(h1_imag, y2);
y2 = _AVX512_FMADDSUB(h1_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
tmp1 = _AVX512_MUL(h2_imag, x1);
y1 = _AVX512_ADD(y1, _AVX512_FMADDSUB(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h2_imag, x2);
y2 = _AVX512_ADD(y2, _AVX512_FMADDSUB(h2_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
// Update phase: write the transformation back into q.
// Row 0: q += y.
q1 = _AVX512_LOAD(&q_dbl[0]);
q2 = _AVX512_LOAD(&q_dbl[offset]);
q1 = _AVX512_ADD(q1, y1);
q2 = _AVX512_ADD(q2, y2);
_AVX512_STORE(&q_dbl[0], q1);
_AVX512_STORE(&q_dbl[offset], q2);
// Row 1: q += x + hh2[1]*y.
h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
q1 = _AVX512_LOAD(&q_dbl[(ldq*2)+0]);
q2 = _AVX512_LOAD(&q_dbl[(ldq*2)+offset]);
q1 = _AVX512_ADD(q1, x1);
q2 = _AVX512_ADD(q2, x2);
tmp1 = _AVX512_MUL(h2_imag, y1);
q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h2_imag, y2);
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(ldq*2)+0], q1);
_AVX512_STORE(&q_dbl[(ldq*2)+offset], q2);
// Middle rows: q(i,:) += hh1[i-1]*x + hh2[i]*y.
for (i = 2; i < nb; i++)
{
q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);
tmp1 = _AVX512_MUL(h1_imag, x1);
q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h1_imag, x2);
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
h2_real = _AVX512_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+i)*2)+1]);
tmp1 = _AVX512_MUL(h2_imag, y1);
q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h2_real, y1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h2_imag, y2);
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h2_real, y2, _AVX512_SHUFFLE(tmp2, tm2, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
_AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
}
// Last row nb: only the first reflector contributes.
h1_real = _AVX512_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((nb-1)*2)+1]);
q1 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*nb*ldq)+offset]);
tmp1 = _AVX512_MUL(h1_imag, x1);
q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
tmp2 = _AVX512_MUL(h1_imag, x2);
q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
_AVX512_STORE(&q_dbl[(2*nb*ldq)+0], q1);
_AVX512_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
float* s_dbl = (float*)(&s);
#endif
__AVX512_DATATYPE x1, x2;
__AVX512_DATATYPE y1, y2;
__AVX512_DATATYPE q1, q2;
__AVX512_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__AVX512_DATATYPE tmp1, tmp2;
int i=0;
#ifdef DOUBLE_PRECISION_COMPLEX
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif
x1 = _AVX512_LOAD(&q_dbl[(2*ldq)+0]);
h2_real = _AVX512_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _AVX512_SET1(hh_dbl[((ldh+1)*2)+1]);
y1 = _AVX512_LOAD(&q_dbl[0]);
tmp1 = _AVX512_MUL(h2_imag, x1);
y1 = _AVX512_ADD(y1, _AVX512_FMSUBADD(h2_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
for (i = 2; i < nb; i++)
{
q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
h1_real = _AVX512_SET1(hh_dbl[(i-1)*2]);
h1_imag = _AVX512_SET1(hh_dbl[((i-1)*2)+1]);