Commit 92bfbd49 authored by Lorenz Huedepohl's avatar Lorenz Huedepohl
Browse files

Fix stack-overflow in kernels

There were a couple of places where a load of a stack variable was too
wide. Detected with the

  -fsanitize=address

flag of GCC, which we should probably incorporate into our CI tests.
parent c8b105a7
......@@ -72,8 +72,6 @@
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_LOADU _mm_loadu_pd
#define _AVX_STOREU _mm_storeu_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
......@@ -108,8 +106,6 @@
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_LOADU _mm_loadu_ps
#define _AVX_STOREU _mm_storeu_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
......@@ -503,12 +499,11 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float comple
h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
__m128d tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__m128 tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_ps(&tmp_s_128);
tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
tmp1 = _AVX_MUL(h2_imag, tmp2);
......@@ -518,9 +513,8 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float comple
tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_AVX_STOREU(s_dbl, _CAST(tmp2));
h2_real = _AVX_SET1(s_dbl[0]);
h2_imag = _AVX_SET1(s_dbl[1]);
h2_real = _AVX_SET1(tmp2[0]);
h2_imag = _AVX_SET1(tmp2[1]);
tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
......@@ -934,23 +928,21 @@ static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float comple
h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
__m128d tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__m128 tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_ps(&tmp_s_128);
tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _AVX_FMADDSUB(h2_real, tmp2, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_AVX_STOREU(s_dbl, _CAST(tmp2));
h2_real = _AVX_SET1(s_dbl[0]);
h2_imag = _AVX_SET1(s_dbl[1]);
h2_real = _AVX_SET1(tmp2[0]);
h2_imag = _AVX_SET1(tmp2[1]);
tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
......@@ -1283,12 +1275,11 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex
h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX
__m128d tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__m128 tmp_s_128 = _AVX_LOADU(s_dbl);
tmp2 = _mm256_broadcast_ps(&tmp_s_128);
tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
tmp1 = _AVX_MUL(h2_imag, tmp2);
......@@ -1297,9 +1288,8 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex
#else
tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_AVX_STOREU(s_dbl, _CAST(tmp2));
h2_real = _AVX_SET1(s_dbl[0]);
h2_imag = _AVX_SET1(s_dbl[1]);
h2_real = _AVX_SET1(tmp2[0]);
h2_imag = _AVX_SET1(tmp2[1]);
tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
......@@ -1547,13 +1537,13 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
h2_real = _AVX_XOR(h2_real, sign);
h2_imag = _AVX_XOR(h2_imag, sign);
__AVX_DATATYPE tmp2;
#ifdef DOUBLE_PRECISION_COMPLEX
__m128d tmp_s_128 = _AVX_LOADU(s_dbl);
__AVX_DATATYPE tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__m128 tmp_s_128 = _AVX_LOADU(s_dbl);
__AVX_DATATYPE tmp2 = _mm256_broadcast_ps(&tmp_s_128);
tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
#endif
tmp1 = _AVX_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
......@@ -1561,9 +1551,8 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
#else
tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_AVX_STOREU(s_dbl, _CAST(tmp2));
h2_real = _AVX_SET1(s_dbl[0]);
h2_imag = _AVX_SET1(s_dbl[1]);
h2_real = _AVX_SET1(tmp2[0]);
h2_imag = _AVX_SET1(tmp2[1]);
tmp1 = _AVX_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
......
......@@ -63,6 +63,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <pmmintrin.h>
#define __forceinline __attribute__((always_inline))
......@@ -475,7 +476,11 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex
h2_real = _SSE_XOR(h2_real, sign);
h2_imag = _SSE_XOR(h2_imag, sign);
#ifdef SINGLE_PRECISION_COMPLEX
tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
#else
tmp2 = _SSE_LOADU(s_dbl);
#endif
tmp1 = _SSE_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
......@@ -484,20 +489,13 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex
tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
_SSE_STOREU(s_dbl, tmp2);
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_set1_pd(s_dbl[0]);
h2_imag = _mm_set1_pd(s_dbl[1]);
// h2_real = _mm_loaddup_pd(&s_dbl[0]);
// h2_imag = _mm_loaddup_pd(&s_dbl[1]);
h2_real = _mm_movedup_pd(tmp2);
h2_imag = _mm_set1_pd(tmp2[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = _mm_set1_ps(s_dbl[0]);
h2_imag = _mm_set1_ps(s_dbl[1]);
// h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
// h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));
h2_real = _mm_moveldup_ps(tmp2);
h2_imag = _mm_movehdup_ps(tmp2);
#endif
tmp1 = _SSE_MUL(h1_imag, y1);
......
......@@ -307,10 +307,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
__SSE_DATATYPE x6 = _SSE_LOAD(&q[ldq+5*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
__SSE_DATATYPE h2;
......@@ -329,12 +329,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
......@@ -359,10 +359,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
......@@ -381,15 +381,14 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
h1 = _SSE_XOR(tau1, sign);
......@@ -428,11 +427,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[5*offset],q6);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
q1 = _SSE_LOAD(&q[ldq]);
......@@ -457,15 +455,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[i-1]));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+i]));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
......@@ -488,11 +483,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[(i*ldq)+5*offset],q6);
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[nb-1]));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
......@@ -554,10 +548,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
__SSE_DATATYPE x5 = _SSE_LOAD(&q[ldq+4*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
__SSE_DATATYPE h2;
......@@ -574,12 +568,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
......@@ -601,10 +595,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
......@@ -621,14 +615,14 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
......@@ -663,11 +657,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[4*offset],q5);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
q1 = _SSE_LOAD(&q[ldq]);
......@@ -689,15 +682,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[i-1]));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+i]));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
......@@ -717,11 +707,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[(i*ldq)+4*offset],q5);
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[nb-1]));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
......@@ -777,10 +766,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
__SSE_DATATYPE x4 = _SSE_LOAD(&q[ldq+3*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
__SSE_DATATYPE h2;
......@@ -795,12 +784,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
......@@ -819,10 +808,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
......@@ -837,14 +826,14 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
......@@ -874,11 +863,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[3*offset],q4);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
q1 = _SSE_LOAD(&q[ldq]);
......@@ -897,15 +885,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[i-1]));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+i]));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
......@@ -922,11 +907,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[(i*ldq)+3*offset],q4);
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[nb-1]));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
......@@ -978,10 +962,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
__SSE_DATATYPE x3 = _SSE_LOAD(&q[ldq+2*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
__SSE_DATATYPE h2;
......@@ -994,12 +978,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
......@@ -1015,10 +999,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
h1 = _mm_set1_ps(hh[nb-1]);
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
......@@ -1031,14 +1015,14 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
......@@ -1063,11 +1047,10 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
_SSE_STORE(&q[2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
q1 = _SSE_LOAD(&q[ldq]);
......@@ -1083,15 +1066,12 @@ __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x8
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);