Commit 60720341 authored by Andreas Marek

Start to unify double/single complex AVX512 block1 kernel

parent b465f154
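The unification follows the pattern visible in the double-precision hunk below: the architecture-specific source shrinks to a handful of precision/kind macros plus an include of the shared template. A minimal sketch of that layout, assuming the single-precision file selects a SINGLE_PRECISION macro in the same way (its added lines are not part of this excerpt):

/* Sketch of the unified kernel file; the DOUBLE_PRECISION lines appear
   verbatim in the hunk below, the SINGLE_PRECISION variant is an assumption. */
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1   /* or SINGLE_PRECISION in the float-complex file */
#include "../../general/precision_macros.h"
#include "elpa2_kernels_complex_avx512_1hv_template.Xc"
#undef DOUBLE_PRECISION
#undef COMPLEXCASE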
@@ -596,13 +596,14 @@ EXTRA_DIST = \
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_4hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_6hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_sse_1hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_4hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_6hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_sse_1hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_sse_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_avx-avx2_1hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_avx-avx2_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_avx512_1hv_template.Xc \
src/elpa2/redist_band.X90 \
src/elpa2/pack_unpack_cpu.X90 \
src/elpa2/pack_unpack_gpu.X90 \
......
@@ -42,414 +42,14 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
#endif
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx512_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
//int ldh = *pldh;
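// Process nq in blocks of 24 complex numbers (6 zmm registers of 4 double
// complex each); a remainder of 16 or 8 is handled by the narrower kernels.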
for (i = 0; i < nq-16; i+=24)
{
hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq);
}
if (nq == i)
{
return;
}
if (nq-i == 16)
{
hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq);
}
else
{
hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq);
}
}
static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
__m512d x1, x2, x3, x4, x5, x6;
__m512d q1, q2, q3, q4, q5, q6;
__m512d h1_real, h1_imag;
__m512d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
int i=0;
__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
x1 = _mm512_load_pd(&q_dbl[0]); // complex 1, 2, 3, 4
x2 = _mm512_load_pd(&q_dbl[8]); // complex 5, 6, 7, 8
x3 = _mm512_load_pd(&q_dbl[16]); // complex 9, 10, 11, 12
x4 = _mm512_load_pd(&q_dbl[24]); // complex 13, 14, 15, 16
x5 = _mm512_load_pd(&q_dbl[32]); // complex 17, 18, 19, 20
x6 = _mm512_load_pd(&q_dbl[40]); // complex 21, 22, 23, 24
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
q5 = _mm512_load_pd(&q_dbl[(2*i*ldq)+32]);
q6 = _mm512_load_pd(&q_dbl[(2*i*ldq)+40]);
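// Accumulate x += conj(hh[i]) * q_i: tmp = h_imag * q, the shuffle swaps each
// real/imaginary product pair, and fmsubadd combines the partial products into
// (h_re*q_re + h_im*q_im, h_re*q_im - h_im*q_re) per complex element.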
tmp1 = _mm512_mul_pd(h1_imag, q1);
x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, q2);
x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
tmp3 = _mm512_mul_pd(h1_imag, q3);
x3 = _mm512_add_pd(x3, _mm512_FMSUBADD_pd(h1_real, q3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));
tmp4 = _mm512_mul_pd(h1_imag, q4);
x4 = _mm512_add_pd(x4, _mm512_FMSUBADD_pd(h1_real, q4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));
tmp5 = _mm512_mul_pd(h1_imag, q5);
x5 = _mm512_add_pd(x5, _mm512_FMSUBADD_pd(h1_real, q5, _mm512_shuffle_pd(tmp5, tmp5, 0x55)));
tmp6 = _mm512_mul_pd(h1_imag, q6);
x6 = _mm512_add_pd(x6, _mm512_FMSUBADD_pd(h1_real, q6, _mm512_shuffle_pd(tmp6, tmp6, 0x55)));
}
h1_real = _mm512_set1_pd(hh_dbl[0]);
h1_imag = _mm512_set1_pd(hh_dbl[1]);
// h1_real = _mm512_xor_pd(h1_real, sign);
// h1_imag = _mm512_xor_pd(h1_imag, sign);
h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
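// The sign-bit xor above negates hh[0], so the fmaddsub below evaluates
// x = (-hh[0]) * x as an ordinary complex multiplication.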
tmp1 = _mm512_mul_pd(h1_imag, x1);
x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));
tmp2 = _mm512_mul_pd(h1_imag, x2);
x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));
tmp3 = _mm512_mul_pd(h1_imag, x3);
x3 = _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55));
tmp4 = _mm512_mul_pd(h1_imag, x4);
x4 = _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55));
tmp5 = _mm512_mul_pd(h1_imag, x5);
x5 = _mm512_FMADDSUB_pd(h1_real, x5, _mm512_shuffle_pd(tmp5, tmp5, 0x55));
tmp6 = _mm512_mul_pd(h1_imag, x6);
x6 = _mm512_FMADDSUB_pd(h1_real, x6, _mm512_shuffle_pd(tmp6, tmp6, 0x55));
q1 = _mm512_load_pd(&q_dbl[0]);
q2 = _mm512_load_pd(&q_dbl[8]);
q3 = _mm512_load_pd(&q_dbl[16]);
q4 = _mm512_load_pd(&q_dbl[24]);
q5 = _mm512_load_pd(&q_dbl[32]);
q6 = _mm512_load_pd(&q_dbl[40]);
q1 = _mm512_add_pd(q1, x1);
q2 = _mm512_add_pd(q2, x2);
q3 = _mm512_add_pd(q3, x3);
q4 = _mm512_add_pd(q4, x4);
q5 = _mm512_add_pd(q5, x5);
q6 = _mm512_add_pd(q6, x6);
_mm512_store_pd(&q_dbl[0], q1);
_mm512_store_pd(&q_dbl[8], q2);
_mm512_store_pd(&q_dbl[16], q3);
_mm512_store_pd(&q_dbl[24], q4);
_mm512_store_pd(&q_dbl[32], q5);
_mm512_store_pd(&q_dbl[40], q6);
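// Rank-1 update of the remaining columns: q[i*ldq] += hh[i] * x.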
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
q5 = _mm512_load_pd(&q_dbl[(2*i*ldq)+32]);
q6 = _mm512_load_pd(&q_dbl[(2*i*ldq)+40]);
tmp1 = _mm512_mul_pd(h1_imag, x1);
q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, x2);
q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
tmp3 = _mm512_mul_pd(h1_imag, x3);
q3 = _mm512_add_pd(q3, _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));
tmp4 = _mm512_mul_pd(h1_imag, x4);
q4 = _mm512_add_pd(q4, _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));
tmp5 = _mm512_mul_pd(h1_imag, x5);
q5 = _mm512_add_pd(q5, _mm512_FMADDSUB_pd(h1_real, x5, _mm512_shuffle_pd(tmp5, tmp5, 0x55)));
tmp6 = _mm512_mul_pd(h1_imag, x6);
q6 = _mm512_add_pd(q6, _mm512_FMADDSUB_pd(h1_real, x6, _mm512_shuffle_pd(tmp6, tmp6, 0x55)));
_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
_mm512_store_pd(&q_dbl[(2*i*ldq)+16], q3);
_mm512_store_pd(&q_dbl[(2*i*ldq)+24], q4);
_mm512_store_pd(&q_dbl[(2*i*ldq)+32], q5);
_mm512_store_pd(&q_dbl[(2*i*ldq)+40], q6);
}
}
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
__m512d x1, x2, x3, x4;
__m512d q1, q2, q3, q4;
__m512d h1_real, h1_imag;
__m512d tmp1, tmp2, tmp3, tmp4;
int i=0;
__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
x1 = _mm512_load_pd(&q_dbl[0]); // complex 1 2 3 4
x2 = _mm512_load_pd(&q_dbl[8]);
x3 = _mm512_load_pd(&q_dbl[16]);
x4 = _mm512_load_pd(&q_dbl[24]); // complex 13 14 15 16
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
tmp1 = _mm512_mul_pd(h1_imag, q1);
x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, q2);
x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
tmp3 = _mm512_mul_pd(h1_imag, q3);
x3 = _mm512_add_pd(x3, _mm512_FMSUBADD_pd(h1_real, q3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));
tmp4 = _mm512_mul_pd(h1_imag, q4);
x4 = _mm512_add_pd(x4, _mm512_FMSUBADD_pd(h1_real, q4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));
}
h1_real = _mm512_set1_pd(hh_dbl[0]);
h1_imag = _mm512_set1_pd(hh_dbl[1]);
// h1_real = _mm512_xor_pd(h1_real, sign);
// h1_imag = _mm512_xor_pd(h1_imag, sign);
h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
tmp1 = _mm512_mul_pd(h1_imag, x1);
x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));
tmp2 = _mm512_mul_pd(h1_imag, x2);
x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));
tmp3 = _mm512_mul_pd(h1_imag, x3);
x3 = _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55));
tmp4 = _mm512_mul_pd(h1_imag, x4);
x4 = _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55));
q1 = _mm512_load_pd(&q_dbl[0]);
q2 = _mm512_load_pd(&q_dbl[8]);
q3 = _mm512_load_pd(&q_dbl[16]);
q4 = _mm512_load_pd(&q_dbl[24]);
q1 = _mm512_add_pd(q1, x1);
q2 = _mm512_add_pd(q2, x2);
q3 = _mm512_add_pd(q3, x3);
q4 = _mm512_add_pd(q4, x4);
_mm512_store_pd(&q_dbl[0], q1);
_mm512_store_pd(&q_dbl[8], q2);
_mm512_store_pd(&q_dbl[16], q3);
_mm512_store_pd(&q_dbl[24], q4);
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
q3 = _mm512_load_pd(&q_dbl[(2*i*ldq)+16]);
q4 = _mm512_load_pd(&q_dbl[(2*i*ldq)+24]);
tmp1 = _mm512_mul_pd(h1_imag, x1);
q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, x2);
q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
tmp3 = _mm512_mul_pd(h1_imag, x3);
q3 = _mm512_add_pd(q3, _mm512_FMADDSUB_pd(h1_real, x3, _mm512_shuffle_pd(tmp3, tmp3, 0x55)));
tmp4 = _mm512_mul_pd(h1_imag, x4);
q4 = _mm512_add_pd(q4, _mm512_FMADDSUB_pd(h1_real, x4, _mm512_shuffle_pd(tmp4, tmp4, 0x55)));
_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
_mm512_store_pd(&q_dbl[(2*i*ldq)+16], q3);
_mm512_store_pd(&q_dbl[(2*i*ldq)+24], q4);
}
}
static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
__m512d x1, x2;
__m512d q1, q2;
__m512d h1_real, h1_imag;
__m512d tmp1, tmp2;
int i=0;
__m512d sign = (__m512d)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
x1 = _mm512_load_pd(&q_dbl[0]);
x2 = _mm512_load_pd(&q_dbl[8]);
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
tmp1 = _mm512_mul_pd(h1_imag, q1);
x1 = _mm512_add_pd(x1, _mm512_FMSUBADD_pd(h1_real, q1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, q2);
x2 = _mm512_add_pd(x2, _mm512_FMSUBADD_pd(h1_real, q2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
}
h1_real = _mm512_set1_pd(hh_dbl[0]);
h1_imag = _mm512_set1_pd(hh_dbl[1]);
// h1_real = _mm512_xor_pd(h1_real, sign);
// h1_imag = _mm512_xor_pd(h1_imag, sign);
h1_real = (__m512d) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__m512d) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
tmp1 = _mm512_mul_pd(h1_imag, x1);
x1 = _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55));
tmp2 = _mm512_mul_pd(h1_imag, x2);
x2 = _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55));
q1 = _mm512_load_pd(&q_dbl[0]);
q2 = _mm512_load_pd(&q_dbl[8]);
q1 = _mm512_add_pd(q1, x1);
q2 = _mm512_add_pd(q2, x2);
_mm512_store_pd(&q_dbl[0], q1);
_mm512_store_pd(&q_dbl[8], q2);
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_pd(hh_dbl[i*2]);
h1_imag = _mm512_set1_pd(hh_dbl[(i*2)+1]);
q1 = _mm512_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_pd(&q_dbl[(2*i*ldq)+8]);
tmp1 = _mm512_mul_pd(h1_imag, x1);
q1 = _mm512_add_pd(q1, _mm512_FMADDSUB_pd(h1_real, x1, _mm512_shuffle_pd(tmp1, tmp1, 0x55)));
tmp2 = _mm512_mul_pd(h1_imag, x2);
q2 = _mm512_add_pd(q2, _mm512_FMADDSUB_pd(h1_real, x2, _mm512_shuffle_pd(tmp2, tmp2, 0x55)));
_mm512_store_pd(&q_dbl[(2*i*ldq)+0], q1);
_mm512_store_pd(&q_dbl[(2*i*ldq)+8], q2);
}
}
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "elpa2_kernels_complex_avx512_1hv_template.Xc"
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
@@ -42,411 +42,14 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
#endif
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx512_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
//int ldh = *pldh;
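// Single precision processes 48 complex numbers per iteration (6 zmm registers
// of 8 float complex each); remainders of 32 or 16 use the narrower kernels.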
for (i = 0; i < nq-32; i+=48)
{
hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq);
}
if (nq == i)
{
return;
}
if (nq-i == 32)
{
hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq);
}
else
{
hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq);
}
}
static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
__m512 x1, x2, x3, x4, x5, x6;
__m512 q1, q2, q3, q4, q5, q6;
__m512 h1_real, h1_imag;
__m512 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
int i=0;
__m512 sign = (__m512)_mm512_set1_epi32(0x80000000);
x1 = _mm512_load_ps(&q_dbl[0]); // complex 1, 2, 3, 4, 5, 6, 7, 8
x2 = _mm512_load_ps(&q_dbl[16]); // complex 9, 10, 11, 12, 13, 14, 15, 16
x3 = _mm512_load_ps(&q_dbl[32]); // complex 17 .. 24
x4 = _mm512_load_ps(&q_dbl[48]); // complex 25 .. 32
x5 = _mm512_load_ps(&q_dbl[64]); // complex 33 .. 40
x6 = _mm512_load_ps(&q_dbl[80]); // complex 41 .. 48
for (i = 1; i < nb; i++)
{
h1_real = _mm512_set1_ps(hh_dbl[i*2]);
h1_imag = _mm512_set1_ps(hh_dbl[(i*2)+1]);
q1 = _mm512_load_ps(&q_dbl[(2*i*ldq)+0]);
q2 = _mm512_load_ps(&q_dbl[(2*i*ldq)+16]);
q3 = _mm512_load_ps(&q_dbl[(2*i*ldq)+32]);
q4 = _mm512_load_ps(&q_dbl[(2*i*ldq)+48]);
q5 = _mm512_load_ps(&q_dbl[(2*i*ldq)+64]);
q6 = _mm512_load_ps(&q_dbl[(2*i*ldq)+80]);
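// Same conj(hh[i]) * q accumulation as in the double-precision kernel; the
// 0xb1 shuffle swaps adjacent float re/im products within each 128-bit lane.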
tmp1 = _mm512_mul_ps(h1_imag, q1);
x1 = _mm512_add_ps(x1, _mm512_FMSUBADD_ps(h1_real, q1, _mm512_shuffle_ps(tmp1, tmp1, 0xb1)));
tmp2 = _mm512_mul_ps(h1_imag, q2);
x2 = _mm512_add_ps(x2, _mm512_FMSUBADD_ps(h1_real, q2, _mm512_shuffle_ps(tmp2, tmp2, 0xb1)));
tmp3 = _mm512_mul_ps(h1_imag, q3);
x3 = _mm512_add_ps(x3, _mm512_FMSUBADD_ps(h1_real, q3, _mm512_shuffle_ps(tmp3, tmp3, 0xb1)));
tmp4 = _mm512_mul_ps(h1_imag, q4);
x4 = _mm512_add_ps(x4, _mm512_FMSUBADD_ps(h1_real, q4, _mm512_shuffle_ps(tmp4, tmp4, 0xb1)));
tmp5 = _mm512_mul_ps(h1_imag, q5);
x5 = _mm512_add_ps(x5, _mm512_FMSUBADD_ps(h1_real, q5, _mm512_shuffle_ps(tmp5, tmp5, 0xb1)));
tmp6 = _mm512_mul_ps(h1_imag, q6);
x6 = _mm512_add_ps(x6, _mm512_FMSUBADD_ps(h1_real, q6, _mm512_shuffle_ps(tmp6, tmp6, 0xb1)));
}
h1_real = _mm512_set1_ps(hh_dbl[0]);
h1_imag = _mm512_set1_ps(hh_dbl[1]);
// h1_real = _mm512_xor_ps(h1_real, sign);
// h1_imag = _mm512_xor_ps(h1_imag, sign);
h1_real = (__m512) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__m512) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);