Commit dfd91ff8 authored by Andreas Marek

Debug AVX block2

parent d1aa0eb4
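
This commit adds a narrow AVX kernel (2 complex doubles, or 4 complex floats, per register) for the two-Householder-vector ("block 2") complex transform. For orientation, here is a minimal scalar sketch of the update the intrinsics below vectorize; the function name, the assumed hh layout, and the exact role of s are inferred from the surrounding kernels and are not part of the commit:

#include <complex.h>

// Hypothetical scalar reference: processes one vector where the AVX kernel
// handles a register's worth.  Assumed layout, mirroring the kernel: v1 lives
// in hh[0..nb-1] with tau1 = hh[0], v2 in hh[ldh..ldh+nb-1] with
// tau2 = hh[ldh], and the scalar s couples the two reflectors.
static void hh_trafo_complex_scalar_2hv(double complex* q, double complex* hh,
                                        int nb, int ldq, int ldh, double complex s)
{
  double complex x = q[ldq];                     // v1 has an implicit 1 at row 1
  double complex y = q[0] + conj(hh[ldh+1]) * x; // v2 has an implicit 1 at row 0
  int i;
  for (i = 2; i < nb; i++)                       // accumulate x = v1^H q, y = v2^H q
  {
    x += conj(hh[i-1])   * q[i*ldq];
    y += conj(hh[ldh+i]) * q[i*ldq];
  }
  x += conj(hh[nb-1]) * q[nb*ldq];               // v1 is one row longer than v2

  x = -hh[0] * x;                                // x = -tau1 * x
  y = -hh[ldh] * y + (-hh[ldh] * s) * x;         // y = -tau2 * (y + s*x)

  q[0]   += y;                                   // rank-2 update of the column
  q[ldq] += x + hh[ldh+1] * y;
  for (i = 2; i < nb; i++)
    q[i*ldq] += hh[i-1] * x + hh[ldh+i] * y;
  q[nb*ldq] += hh[nb-1] * x;
}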
@@ -4502,6 +4502,10 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
}
#if VEC_SET == SSE_128 || (VEC_SET == AVX_256 && BLOCK == 1)
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 1
@@ -4950,4 +4954,220 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
}
#endif
#if VEC_SET == AVX_256 && BLOCK == 2
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_2_AVX_AVX2_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_AVX_AVX2_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
float* s_dbl = (float*)(&s);
#endif
__AVX_DATATYPE x1;
__AVX_DATATYPE y1;
__AVX_DATATYPE q1;
__AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
__AVX_DATATYPE tmp1;
int i=0;
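// Sign mask: XOR-ing the imaginary part alone conjugates h; XOR-ing both
// parts negates the full complex scalar.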
#ifdef DOUBLE_PRECISION_COMPLEX
__AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _AVX_XOR(h2_imag, sign);
#endif
y1 = _AVX_LOAD(&q_dbl[0]);
tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
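// The idiom above is an interleaved complex multiply-accumulate:
//   tmp1 = h_imag * [xr, xi]        -> [hi*xr, hi*xi]
//   _AVX_SHUFFLE swaps each pair    -> [hi*xi, hi*xr]
//   _AVX_FMSUBADD(h_real, x, .)     -> [hr*xr + hi*xi, hr*xi - hi*xr] = conj(h)*x
// Without FMA, conjugating h_imag via the sign mask and using _AVX_ADDSUB
// yields the same result.  Here: y1 = q(0,:) + conj(hh[ldh+1]) * x1.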
for (i = 2; i < nb; i++)
{
q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _AVX_XOR(h1_imag, sign);
#endif
tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _AVX_XOR(h2_imag, sign);
#endif
tmp1 = _AVX_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
}
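// The first Householder vector extends one row past the second, so row nb
// contributes to x1 only.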
h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _AVX_XOR(h1_imag, sign);
#endif
q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _AVX_ADD(x1, _AVX_ADDSUB( _AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
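// Apply the reflectors: x1 = -tau1 * x1 (tau1 = hh[0]; XOR-ing both halves
// with the sign mask negates the complex scalar).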
h1_real = _AVX_BROADCAST(&hh_dbl[0]);
h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
h1_real = _AVX_XOR(h1_real, sign);
h1_imag = _AVX_XOR(h1_imag, sign);
tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
x1 = _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
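// tau2 = hh[ldh], loaded twice and negated: the h1 copy scales y1, the h2
// copy multiplies the coupling scalar s.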
h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
h1_real = _AVX_XOR(h1_real, sign);
h1_imag = _AVX_XOR(h1_imag, sign);
h2_real = _AVX_XOR(h2_real, sign);
h2_imag = _AVX_XOR(h2_imag, sign);
__AVX_DATATYPE tmp2;
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
h2_real = _AVX_SET1(tmp2[0]);
h2_imag = _AVX_SET1(tmp2[1]);
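// h2_real/h2_imag now broadcast the components of -tau2*s.  The element
// accesses tmp2[0]/tmp2[1] rely on the GNU vector-subscript extension.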
tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_FMADDSUB(h1_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _AVX_ADDSUB( _AVX_MUL(h1_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_ADD(y1, _AVX_FMADDSUB(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _AVX_ADD(y1, _AVX_ADDSUB( _AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
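// y1 = -tau2*y1 - (tau2*s)*x1 completes the accumulation; the rest of the
// kernel writes the rank-2 update back into q.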
q1 = _AVX_LOAD(&q_dbl[0]);
q1 = _AVX_ADD(q1, y1);
_AVX_STORE(&q_dbl[0], q1);
h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
q1 = _AVX_LOAD(&q_dbl[(ldq*2)+0]);
q1 = _AVX_ADD(q1, x1);
tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
_AVX_STORE(&q_dbl[(ldq*2)+0], q1);
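// Remaining rows i = 2..nb-1: q(i,:) += hh[i-1]*x1 + hh[ldh+i]*y1.
// Note there is no conjugation in the update phase.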
for (i = 2; i < nb; i++)
{
q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
tmp1 = _AVX_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h2_real, y1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h2_real, y1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
_AVX_STORE(&q_dbl[(2*i*ldq)+0], q1);
}
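// Row nb is touched only by the first (longer) Householder vector.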
h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_ADD(q1, _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _AVX_ADD(q1, _AVX_ADDSUB( _AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
_AVX_STORE(&q_dbl[(2*nb*ldq)+0], q1);
}
#endif