Commit 92bfbd49 authored by Lorenz Huedepohl

Fix stack-overflow in kernels

There were a couple of places where a load of a stack variable was too
wide. Detected with the

  -fsanitize=address

flag of GCC, which we should probably incorporate into our CI tests.
parent c8b105a7
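For context, the pattern being fixed can be reduced to the following sketch. This is not ELPA code; the file name, variable names and the main() scaffolding are only illustrative. In the single-precision kernels, s is one float complex, i.e. only 8 bytes on the stack, but it was read with an unaligned 128-bit load, so 8 bytes past the variable were also touched. Uncommenting the old pattern and building with gcc -mavx -fsanitize=address makes AddressSanitizer report a stack-buffer-overflow read; the fix builds the vector from the two scalars that actually exist.

  /* Minimal sketch of the too-wide load fixed in this commit (hypothetical
   * stand-alone file, not the ELPA kernel itself). */
  #include <complex.h>
  #include <immintrin.h>
  #include <stdio.h>

  int main(void)
  {
      float complex s = 1.0f + 2.0f * I;
      float *s_dbl = (float *) &s;      /* s_dbl[0] = Re(s), s_dbl[1] = Im(s) */

      /* Old pattern: loads 4 floats although only 2 exist on the stack.
       * AddressSanitizer flags this as a stack-buffer-overflow read.
       * __m128 tmp_s_128 = _mm_loadu_ps(s_dbl);
       * __m256 tmp2      = _mm256_broadcast_ps(&tmp_s_128);
       */

      /* Fixed pattern: only the two existing scalars are read. */
      __m256 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
                                  s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);

      float out[8];
      _mm256_storeu_ps(out, tmp2);
      printf("re=%f im=%f\n", out[0], out[1]);
      return 0;
  }

For double complex the same 128-bit load covers exactly sizeof(double complex), so presumably only the single-precision kernels actually read (and, via _AVX_STOREU, wrote) past the variable; the commit nevertheless drops the _AVX_LOADU/_AVX_STOREU pattern in both precisions.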
@@ -4,15 +4,15 @@
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//   Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
//   Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
//   Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//   and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
@@ -33,7 +33,7 @@
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
@@ -72,8 +72,6 @@
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
-#define _AVX_LOADU _mm_loadu_pd
-#define _AVX_STOREU _mm_storeu_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
@@ -108,8 +106,6 @@
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
-#define _AVX_LOADU _mm_loadu_ps
-#define _AVX_STOREU _mm_storeu_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
@@ -159,12 +155,12 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>              bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex) :: q(*)
!f>     type(c_ptr), value :: q
!f>     complex(kind=c_double_complex) :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
@@ -175,12 +171,12 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>              bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex) :: q(*)
!f>     type(c_ptr), value :: q
!f>     complex(kind=c_float_complex) :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
@@ -230,48 +226,48 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
    }
#endif
    if (nq-i == 0) {
        return;
    }
#ifdef DOUBLE_PRECISION_COMPLEX
    if (nq-i == 6) {
        hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
        worked_on += 6;
    }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    if (nq-i == 12) {
        hh_trafo_complex_kernel_12_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
        worked_on += 12;
    }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
    if (nq-i == 4) {
        hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
        worked_on += 4;
    }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    if (nq-i == 8) {
        hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
        worked_on += 8;
    }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
    if (nq-i == 2) {
        hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
        worked_on += 2;
    }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    if (nq-i == 4) {
        hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
        worked_on += 4;
    }
#endif
    if (worked_on != nq) {
        printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
        abort();
    }
}
@@ -356,7 +352,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float comple
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
@@ -503,12 +499,11 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float comple
    h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
-    __m128d tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
+    tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
-    __m128 tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_ps(&tmp_s_128);
+    tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
+                         s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
    tmp1 = _AVX_MUL(h2_imag, tmp2);
@@ -518,9 +513,8 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float comple
    tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
-    _AVX_STOREU(s_dbl, _CAST(tmp2));
-    h2_real = _AVX_SET1(s_dbl[0]);
-    h2_imag = _AVX_SET1(s_dbl[1]);
+    h2_real = _AVX_SET1(tmp2[0]);
+    h2_imag = _AVX_SET1(tmp2[1]);
    tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
@@ -934,23 +928,21 @@ static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float comple
    h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
-    __m128d tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
+    tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
-    __m128 tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_ps(&tmp_s_128);
+    tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
+                         s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
    tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
    tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
    tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
-    _AVX_STOREU(s_dbl, _CAST(tmp2));
-    h2_real = _AVX_SET1(s_dbl[0]);
-    h2_imag = _AVX_SET1(s_dbl[1]);
+    h2_real = _AVX_SET1(tmp2[0]);
+    h2_imag = _AVX_SET1(tmp2[1]);
    tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
@@ -1283,12 +1275,11 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex
    h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
-    __m128d tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
+    tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
-    __m128 tmp_s_128 = _AVX_LOADU(s_dbl);
-    tmp2 = _mm256_broadcast_ps(&tmp_s_128);
+    tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
+                         s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
    tmp1 = _AVX_MUL(h2_imag, tmp2);
@@ -1297,9 +1288,8 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex
#else
    tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
-    _AVX_STOREU(s_dbl, _CAST(tmp2));
-    h2_real = _AVX_SET1(s_dbl[0]);
-    h2_imag = _AVX_SET1(s_dbl[1]);
+    h2_real = _AVX_SET1(tmp2[0]);
+    h2_imag = _AVX_SET1(tmp2[1]);
    tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
@@ -1547,13 +1537,13 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
    h2_real = _AVX_XOR(h2_real, sign);
    h2_imag = _AVX_XOR(h2_imag, sign);
+    __AVX_DATATYPE tmp2;
#ifdef DOUBLE_PRECISION_COMPLEX
-    __m128d tmp_s_128 = _AVX_LOADU(s_dbl);
-    __AVX_DATATYPE tmp2 = _mm256_broadcast_pd(&tmp_s_128);
+    tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
-    __m128 tmp_s_128 = _AVX_LOADU(s_dbl);
-    __AVX_DATATYPE tmp2 = _mm256_broadcast_ps(&tmp_s_128);
+    tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
+                         s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
    tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
@@ -1561,9 +1551,8 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
#else
    tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
-    _AVX_STOREU(s_dbl, _CAST(tmp2));
-    h2_real = _AVX_SET1(s_dbl[0]);
-    h2_imag = _AVX_SET1(s_dbl[1]);
+    h2_real = _AVX_SET1(tmp2[0]);
+    h2_imag = _AVX_SET1(tmp2[1]);
    tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
...
@@ -63,6 +63,7 @@
#include <complex.h>
#include <x86intrin.h>
+#include <pmmintrin.h>
#define __forceinline __attribute__((always_inline))
@@ -155,31 +156,31 @@ void double_hh_trafo_complex_sse_2hv_double(double complex* q, double complex* h
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
    int i;
    int nb = *pnb;
    int nq = *pldq;
    int ldq = *pldq;
    int ldh = *pldh;
#ifdef DOUBLE_PRECISION_COMPLEX
    double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    float complex s = conj(hh[(ldh)+1])*1.0f;
#endif
    for (i = 2; i < nb; i++)
    {
        s += hh[i-1] * conj(hh[(i+ldh)]);
    }
    for (i = 0; i < nq; i+=4)
    {
#ifdef DOUBLE_PRECISION_COMPLEX
        hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
#endif
    }
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
@@ -189,590 +190,587 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;
    double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    float* q_dbl = (float*)q;
    float* hh_dbl = (float*)hh;
    float* s_dbl = (float*)(&s);
#endif
    __SSE_DATATYPE x1, x2, x3, x4;
    __SSE_DATATYPE y1, y2, y3, y4;
    __SSE_DATATYPE q1, q2, q3, q4;
    __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
    __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
    int i=0;
#ifdef DOUBLE_PRECISION_COMPLEX
    __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
    x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
    x2 = _SSE_LOAD(&q_dbl[(2*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
    x3 = _SSE_LOAD(&q_dbl[(2*ldq)+2*offset]);
    x4 = _SSE_LOAD(&q_dbl[(2*ldq)+3*offset]);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
    h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
    h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif
#ifndef __ELPA_USE_FMA__
    // conjugate
    h2_imag = _SSE_XOR(h2_imag, sign);
#endif
    y1 = _SSE_LOAD(&q_dbl[0]);
    y2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
    y3 = _SSE_LOAD(&q_dbl[2*offset]);
    y4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif
    tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
    y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    y1 = _SSE_ADD(y1, _SSE_ADDSUB( _SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
    y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    y2 = _SSE_ADD(y2, _SSE_ADDSUB( _SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
    tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
    y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    y3 = _SSE_ADD(y3, _SSE_ADDSUB( _SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
    y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    y4 = _SSE_ADD(y4, _SSE_ADDSUB( _SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif