Commit fc70acb2 authored by Andreas Marek's avatar Andreas Marek

Finish cleanup of complex kernels

parent e892f315
......@@ -5501,8 +5501,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == AVX_512 */
//#if (VEC_SET == AVX_512 && BLOCK == 1) || VEC_SET != AVX_512
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
)
......@@ -6061,305 +6059,3 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* BLOCK2 */
}
// #endif
#if 0
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
  /* Apply one complex Householder reflector of length nb (stored in hh) to a
   * panel of q, one AVX-512 register of columns at a time (4 double-complex
   * or 8 float-complex values per register).
   *
   * q   : panel to update, modified in place
   * hh  : Householder vector; hh[0] presumably holds tau -- matches the
   *       non-disabled generic kernels in this file
   * nb  : length of the Householder vector
   * ldq : leading dimension of q, counted in complex elements
   *
   * NOTE(review): this function sits inside "#if 0" -- it is a disabled
   * reference version kept next to the generated kernels above.
   */
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
#endif
  /* Unused locals x2, q2, tmp2 removed; leftover debug printf removed. */
  __SIMD_DATATYPE x1;
  __SIMD_DATATYPE q1;
  __SIMD_DATATYPE h1_real, h1_imag;
  __SIMD_DATATYPE tmp1;
  int i = 0;

  /* Sign-bit mask, used below to negate tau via XOR of the IEEE sign bit. */
#ifdef DOUBLE_PRECISION_COMPLEX
  __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

  /* Accumulate x = q(0,:) + sum over i of hh[i] times q(i,:); the complex
   * product is formed from interleaved (re,im) lanes with MUL + SHUFFLE +
   * FMSUBADD (conjugation follows from the FMSUBADD/FMADDSUB pairing, as in
   * the enabled kernels above). */
  x1 = _SIMD_LOAD(&q_dbl[0]);
  for (i = 1; i < nb; i++)
  {
    h1_real = _SIMD_SET1(hh_dbl[i*2]);
    h1_imag = _SIMD_SET1(hh_dbl[(i*2)+1]);
    q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
    tmp1 = _SIMD_MUL(h1_imag, q1);
    x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
  }

  /* Scale x by -tau: flip the sign bit of both components of hh[0]. */
  h1_real = _SIMD_SET1(hh_dbl[0]);
  h1_imag = _SIMD_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
  h1_real = _SIMD_XOR(h1_real, sign);
  h1_imag = _SIMD_XOR(h1_imag, sign);
#endif
#endif
  tmp1 = _SIMD_MUL(h1_imag, x1);
  x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));

  /* Rank-1 update of the panel: q(0,:) += x, then q(i,:) += hh[i] * x. */
  q1 = _SIMD_LOAD(&q_dbl[0]);
  q1 = _SIMD_ADD(q1, x1);
  _SIMD_STORE(&q_dbl[0], q1);
  for (i = 1; i < nb; i++)
  {
    h1_real = _SIMD_SET1(hh_dbl[i*2]);
    h1_imag = _SIMD_SET1(hh_dbl[(i*2)+1]);
    q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
    tmp1 = _SIMD_MUL(h1_imag, x1);
    q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
  }
}
#endif
#if 0
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX512_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
  /* Apply two complex Householder reflectors (a "2hv" blocked transform) to a
   * panel of q, one AVX-512 register of columns at a time (4 double-complex
   * or 8 float-complex values).
   *
   * q   : panel to update, modified in place
   * hh  : the two Householder vectors, columns of leading dimension ldh;
   *       hh[0] and hh[ldh] presumably hold the two tau factors -- matches
   *       the enabled generic kernels in this file
   * nb  : length of the Householder vectors
   * ldq : leading dimension of q, counted in complex elements
   * ldh : leading dimension of hh, counted in complex elements
   * s   : coupling scalar between the two reflectors, updated locally via
   *       the masked store below
   *
   * NOTE(review): this function sits inside "#if 0" -- it is a disabled
   * reference version kept next to the generated kernels above.
   */
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
  double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
  float* s_dbl = (float*)(&s);
#endif
  /* Unused locals x2, y2, q2 removed, along with the dead load of q2
   * (the loaded value was never read and its address was the only use of
   * the external "offset" macro in this function). */
  __SIMD_DATATYPE x1;
  __SIMD_DATATYPE y1;
  __SIMD_DATATYPE q1;
  __SIMD_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
  __SIMD_DATATYPE tmp1, tmp2;
  int i = 0;

  /* Sign-bit mask, used below to negate the tau factors via XOR. */
#ifdef DOUBLE_PRECISION_COMPLEX
  __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
#endif

  /* Accumulation phase: build x (projection onto reflector 1) and y
   * (projection onto reflector 2) over rows 0..nb of the panel. */
  x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
  h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
  h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
  y1 = _SIMD_LOAD(&q_dbl[0]);
  tmp1 = _SIMD_MUL(h2_imag, x1);
  y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
  for (i = 2; i < nb; i++)
  {
    q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
    h1_real = _SIMD_SET1(hh_dbl[(i-1)*2]);
    h1_imag = _SIMD_SET1(hh_dbl[((i-1)*2)+1]);
    tmp1 = _SIMD_MUL(h1_imag, q1);
    x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
    h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
    tmp1 = _SIMD_MUL(h2_imag, q1);
    y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
  }
  /* Tail row nb contributes to x only (reflector 2 is one row shorter). */
  h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
  h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
  q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
  tmp1 = _SIMD_MUL(h1_imag, q1);
  x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

  /* x := -tau1 * x (sign flip of tau1 via XOR of the sign bit). */
  h1_real = _SIMD_SET1(hh_dbl[0]);
  h1_imag = _SIMD_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
  h1_real = _SIMD_XOR(h1_real, sign);
  h1_imag = _SIMD_XOR(h1_imag, sign);
#endif
#endif
  tmp1 = _SIMD_MUL(h1_imag, x1);
  x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));

  /* Negate tau2 (h1 and h2 both loaded from hh[ldh], then sign-flipped). */
  h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
  h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
  h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
  h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
  h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
  h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
  h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
  h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
  h1_real = _SIMD_XOR(h1_real, sign);
  h1_imag = _SIMD_XOR(h1_imag, sign);
  h2_real = _SIMD_XOR(h2_real, sign);
  h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
#endif

  /* Update the coupling scalar s: broadcast its (re,im) pair, multiply by
   * -tau2, and write the two scalar components back with a masked store
   * (lanes 0 and 1 only). The single-precision broadcast reinterprets the
   * float pair as one double so set1_pd replicates both components at once. */
#ifdef DOUBLE_PRECISION_COMPLEX
  tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
                   s_dbl[1], s_dbl[0],
                   s_dbl[1], s_dbl[0],
                   s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  tmp2 = (__m512) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif
  tmp1 = _SIMD_MUL(h2_imag, tmp2);
  tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
  _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
  h2_real = _SIMD_SET1(s_dbl[0]);
  h2_imag = _SIMD_SET1(s_dbl[1]);

  /* y := -tau2 * y + (updated s) * x. */
  tmp1 = _SIMD_MUL(h1_imag, y1);
  y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
  tmp1 = _SIMD_MUL(h2_imag, x1);
  y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));

  /* Update phase: rank-2 update of the panel rows 0..nb. */
  q1 = _SIMD_LOAD(&q_dbl[0]);
  q1 = _SIMD_ADD(q1, y1);
  _SIMD_STORE(&q_dbl[0], q1);
  h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
  h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
  q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
  q1 = _SIMD_ADD(q1, x1);
  tmp1 = _SIMD_MUL(h2_imag, y1);
  q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
  _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
  for (i = 2; i < nb; i++)
  {
    q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
    h1_real = _SIMD_SET1(hh_dbl[(i-1)*2]);
    h1_imag = _SIMD_SET1(hh_dbl[((i-1)*2)+1]);
    tmp1 = _SIMD_MUL(h1_imag, x1);
    q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
    h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
    tmp1 = _SIMD_MUL(h2_imag, y1);
    q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
    _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
  }
  /* Tail row nb: contribution from reflector 1 only. */
  h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
  h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
  q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
  tmp1 = _SIMD_MUL(h1_imag, x1);
  q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
  _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment