Commit e5f6f43b authored by Andreas Marek

Single precision AVX/AVX2 BLOCK2 kernel

parent b1fe112f
......@@ -97,43 +97,73 @@ endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c
endif
endif
if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c
endif
endif
if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c
endif
endif
if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp
endif
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp
endif
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp
endif
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp
endif
endif
.cu.lo:
......
......@@ -304,6 +304,7 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
......@@ -655,6 +656,7 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
......
......@@ -85,12 +85,12 @@
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
std::complex<double> x0;
std::complex<double> x1;
......@@ -139,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
}
#endif // if 0
void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
......@@ -149,19 +149,19 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex
for (i = 0; i < nq-8; i+=12)
{
hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq);
}
if (nq-i > 4)
{
hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq);
}
}
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -356,7 +356,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex
}
}
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -501,7 +501,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>
}
}
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......
......@@ -85,13 +85,13 @@
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
std::complex<double> x1;
std::complex<double> x2;
......@@ -188,7 +188,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>*
}
#endif
void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -205,29 +205,29 @@ void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex
#if 1
for (i = 0; i < nq-4; i+=8)
{
hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
if (nq-i > 0)
{
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
if (nq-i > 2)
{
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#endif
}
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -660,7 +660,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1013,7 +1013,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
}
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1286,7 +1286,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
}
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......
......@@ -75,12 +75,12 @@
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
std::complex<double> x0;
std::complex<double> x1;
......@@ -129,7 +129,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
}
#endif // if 0
void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
void single_hh_trafo_complex_sse_1hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
......@@ -139,19 +139,19 @@ void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<doub
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_6_SSE_1hv_double(&q[i], hh, nb, ldq);
}
if (nq-i > 2)
{
hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_4_SSE_1hv_double(&q[i], hh, nb, ldq);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq);
hh_trafo_complex_kernel_2_SSE_1hv_double(&q[i], hh, nb, ldq);
}
}
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -346,7 +346,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>
}
}
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -491,7 +491,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>
}
}
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......
......@@ -73,13 +73,13 @@
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
std::complex<double> x1;
std::complex<double> x2;
......@@ -176,7 +176,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>*
}
#endif
void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void double_hh_trafo_complex_sse_2hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -193,25 +193,25 @@ void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<doub
#if 1
for (i = 0; i < nq; i+=4)
{
hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq-2; i+=3)
{
hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_3_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
if (nq-i > 1)
{
hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_1_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#endif
}
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -644,7 +644,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
}
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -997,7 +997,7 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
}
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1270,7 +1270,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2);
}
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......
......@@ -106,7 +106,7 @@ module real_generic_kernel
! Calculate dot product of the two Householder vectors
s = hh(2,2)*1
s = hh(2,2)*1.0
do i=3,nb
s = s+hh(i,2)*hh(i-1,1)
enddo
......
......@@ -81,17 +81,17 @@
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void double_hh_trafo_real_avx_avx2_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#if 0
void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void double_hh_trafo_real_avx_avx2_2hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -112,7 +112,7 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn
// Production level kernel calls with padding
for (i = 0; i < nq-20; i+=24)
{
hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
if (nq == i)
......@@ -122,25 +122,25 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn
if (nq-i == 20)
{
hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+16], hh, nb, ldq, ldh, s);
}
else if (nq-i == 16)
{
hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i == 12)
{
hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+8], hh, nb, ldq, ldh, s);
}
else if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else
{
hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
}
......@@ -167,12 +167,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
#ifdef __AVX__
for (i = 0; i < nq; i+=24)
{
hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq; i+=12)
{
hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
#endif
}
......@@ -184,7 +184,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
* matrix vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [24 x nb+1] * hh
......@@ -498,7 +498,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
* matrix vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [16 x nb+1] * hh
......@@ -732,7 +732,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
* matrix vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [8 x nb+1] * hh
......@@ -886,7 +886,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
* matrix vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+1] * hh
......
......@@ -84,16 +84,16 @@
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void quad_hh_trafo_real_avx_avx2_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#if 0
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void quad_hh_trafo_real_avx_avx2_4hv_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -148,7 +148,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq,
#ifdef __AVX__
for (i = 0; i < nq-8; i+=12)
{
hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
if (nq == i)
{
......@@ -158,17 +158,17 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq,
{
if (nq-i > 4)
{
hh_trafo_kernel_8_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
else
{
hh_trafo_kernel_4_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
}
#else
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
if (nq == i)
{
......@@ -178,18 +178,18 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq,
{
if (nq-i > 2)
{
hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
else
{
hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
}
#endif
}
#if 0
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void quad_hh_trafo_fast_double_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -237,12 +237,12 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i
#ifdef __AVX__
for (i = 0; i < nq; i+=12)
{
hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
#else
for (i = 0; i < nq; i+=6)
{
hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
#endif
}
......@@ -254,7 +254,7 @@ void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, i
* matrix vector product with four householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 x nb+3] * hh
......@@ -782,7 +782,7 @@ __forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int
* matrix vector product with four householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [8 x nb+3] * hh
......@@ -1137,7 +1137,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int
* matrix vector product with four householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
__forceinline void hh_trafo_kernel_4_AVX_4hv