Single precision AVX/AVX2 BLOCK2 kernel

parent b1fe112f
...@@ -97,43 +97,73 @@ endif ...@@ -97,43 +97,73 @@ endif
endif endif
if WITH_REAL_SSE_BLOCK2_KERNEL if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv_single_precision.c
endif
endif endif
if WITH_REAL_AVX_BLOCK2_KERNEL if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
endif
endif endif
if WITH_REAL_SSE_BLOCK4_KERNEL if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv_single_precision.c
endif
endif endif
if WITH_REAL_AVX_BLOCK4_KERNEL if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
endif
endif endif
if WITH_REAL_SSE_BLOCK6_KERNEL if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv_single_precision.c
endif
endif endif
if WITH_REAL_AVX_BLOCK6_KERNEL if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
endif
endif endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp
endif
endif endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp
endif
endif endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp
endif
endif endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp
endif
endif endif
.cu.lo: .cu.lo:
......
...@@ -304,6 +304,7 @@ contains ...@@ -304,6 +304,7 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else else
print *,"At the moment single precision only works with the generic kernels" print *,"At the moment single precision only works with the generic kernels"
...@@ -655,6 +656,7 @@ contains ...@@ -655,6 +656,7 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else else
print *,"At the moment single precision only works with the generic kernels" print *,"At the moment single precision only works with the generic kernels"
......
...@@ -85,12 +85,12 @@ ...@@ -85,12 +85,12 @@
extern "C" { extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0 #if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
std::complex<double> x0; std::complex<double> x0;
std::complex<double> x1; std::complex<double> x1;
...@@ -139,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* ...@@ -139,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
} }
#endif // if 0 #endif // if 0
void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -149,19 +149,19 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex ...@@ -149,19 +149,19 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex
for (i = 0; i < nq-8; i+=12) for (i = 0; i < nq-8; i+=12)
{ {
hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_12_AVX_1hv_double(&q[i], hh, nb, ldq);
} }
if (nq-i > 4) if (nq-i > 4)
{ {
hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_8_AVX_1hv_double(&q[i], hh, nb, ldq);
} }
else if (nq-i > 0) else if (nq-i > 0)
{ {
hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_4_AVX_1hv_double(&q[i], hh, nb, ldq);
} }
} }
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -356,7 +356,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex ...@@ -356,7 +356,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex
} }
} }
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -501,7 +501,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double> ...@@ -501,7 +501,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
......
...@@ -85,13 +85,13 @@ ...@@ -85,13 +85,13 @@
extern "C" { extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0 #if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
std::complex<double> x1; std::complex<double> x1;
std::complex<double> x2; std::complex<double> x2;
...@@ -188,7 +188,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* ...@@ -188,7 +188,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>*
} }
#endif #endif
void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -205,29 +205,29 @@ void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex ...@@ -205,29 +205,29 @@ void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex
#if 1 #if 1
for (i = 0; i < nq-4; i+=8) for (i = 0; i < nq-4; i+=8)
{ {
hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
if (nq-i > 0) if (nq-i > 0)
{ {
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
#else #else
for (i = 0; i < nq-4; i+=6) for (i = 0; i < nq-4; i+=6)
{ {
hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
if (nq-i > 2) if (nq-i > 2)
{ {
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
else if (nq-i > 0) else if (nq-i > 0)
{ {
hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
#endif #endif
} }
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -660,7 +660,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double> ...@@ -660,7 +660,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
} }
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -1013,7 +1013,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double> ...@@ -1013,7 +1013,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
} }
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -1286,7 +1286,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double> ...@@ -1286,7 +1286,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
} }
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
......
...@@ -75,12 +75,12 @@ ...@@ -75,12 +75,12 @@
extern "C" { extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0 #if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
std::complex<double> x0; std::complex<double> x0;
std::complex<double> x1; std::complex<double> x1;
...@@ -129,7 +129,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* ...@@ -129,7 +129,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
} }
#endif // if 0 #endif // if 0
void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_sse_1hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -139,19 +139,19 @@ void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<doub ...@@ -139,19 +139,19 @@ void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<doub
for (i = 0; i < nq-4; i+=6) for (i = 0; i < nq-4; i+=6)
{ {
hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_6_SSE_1hv_double(&q[i], hh, nb, ldq);
} }
if (nq-i > 2) if (nq-i > 2)
{ {
hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_4_SSE_1hv_double(&q[i], hh, nb, ldq);
} }
else if (nq-i > 0) else if (nq-i > 0)
{ {
hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_2_SSE_1hv_double(&q[i], hh, nb, ldq);
} }
} }
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -346,7 +346,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double> ...@@ -346,7 +346,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -491,7 +491,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double> ...@@ -491,7 +491,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
......
...@@ -73,13 +73,13 @@ ...@@ -73,13 +73,13 @@
extern "C" { extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0 #if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
std::complex<double> x1; std::complex<double> x1;
std::complex<double> x2; std::complex<double> x2;
...@@ -176,7 +176,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* ...@@ -176,7 +176,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>*
} }
#endif #endif
void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_complex_sse_2hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -193,25 +193,25 @@ void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<doub ...@@ -193,25 +193,25 @@ void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<doub
#if 1 #if 1
for (i = 0; i < nq; i+=4) for (i = 0; i < nq; i+=4)
{ {
hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
#else #else
for (i = 0; i < nq-2; i+=3) for (i = 0; i < nq-2; i+=3)
{ {
hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_3_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
if (nq-i > 1) if (nq-i > 1)
{ {
hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
else if (nq-i > 0) else if (nq-i > 0)
{ {
hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_1_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
} }
#endif #endif
} }
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -644,7 +644,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double> ...@@ -644,7 +644,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
} }
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -997,7 +997,7 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double> ...@@ -997,7 +997,7 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
} }
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -1270,7 +1270,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double> ...@@ -1270,7 +1270,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2);
} }