Commit 53986264 authored by Lorenz Huedepohl's avatar Lorenz Huedepohl Committed by Andreas Marek

Fix another alignment issue

There was again a case where stack variables were loaded with
instructions that require properly aligned memory. This only surfaced
with the Intel C compiler, whose stack layout was evidently different
enough to trigger the problem.

The same issue was also present in the SSE kernels.
parent dfc670d2
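In short, the pointer-based broadcast intrinsics are replaced by their value-based _set1_ counterparts, so the address of a (possibly under-aligned) stack array is no longer handed to the compiler. The following is a minimal sketch of the two patterns; the names and array size are illustrative and not taken verbatim from the kernels:

/*
 * Sketch only: contrast the old pointer-based broadcast with the new
 * value-based one. s_dbl is a small scratch array on the stack with no
 * guaranteed vector alignment.
 */
#include <immintrin.h>

void broadcast_sketch(void)
{
    double s_dbl[2];                      /* stack scratch, 8-byte aligned at best */
    __m128d tmp2 = _mm_set_pd(2.0, 1.0);

    _mm_storeu_pd(s_dbl, tmp2);           /* unaligned store, always safe */

    /* Old pattern: pass an address; depending on compiler and optimization
     * level, the load may be folded into an alignment-sensitive vector
     * instruction (observed with the Intel C compiler). */
    __m128d h2_old = _mm_loaddup_pd(&s_dbl[0]);

    /* New pattern: pass the scalar value; the broadcast is generated from a
     * register and no aligned memory access is implied. */
    __m128d h2_new = _mm_set1_pd(s_dbl[0]);

    (void)h2_old;
    (void)h2_new;
}

The semantics are unchanged in both cases (the scalar is duplicated into every vector lane); only the way the value reaches the vector register differs.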
@@ -78,6 +78,7 @@
 #define _AVX_ADDSUB _mm256_addsub_pd
 #define _AVX_XOR _mm256_xor_pd
 #define _AVX_BROADCAST _mm256_broadcast_sd
+#define _AVX_SET1 _mm256_set1_pd
 #define _AVX_SHUFFLE _mm256_shuffle_pd
 #define _SHUFFLE 0x5
 #define _CAST _mm256_castpd256_pd128
@@ -113,6 +114,7 @@
 #define _AVX_ADDSUB _mm256_addsub_ps
 #define _AVX_XOR _mm256_xor_ps
 #define _AVX_BROADCAST _mm256_broadcast_ss
+#define _AVX_SET1 _mm256_set1_ps
 #define _AVX_SHUFFLE _mm256_shuffle_ps
 #define _SHUFFLE 0xb1
 #define _CAST _mm256_castps256_ps128
@@ -489,8 +491,8 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex
 #endif
 _AVX_STOREU(s_dbl, _CAST(tmp2));
-h2_real = _AVX_BROADCAST(&s_dbl[0]);
-h2_imag = _AVX_BROADCAST(&s_dbl[1]);
+h2_real = _AVX_SET1(s_dbl[0]);
+h2_imag = _AVX_SET1(s_dbl[1]);
 tmp1 = _AVX_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
@@ -933,8 +935,8 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double comple
 tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _AVX_STOREU(s_dbl, _CAST(tmp2));
-h2_real = _AVX_BROADCAST(&s_dbl[0]);
-h2_imag = _AVX_BROADCAST(&s_dbl[1]);
+h2_real = _AVX_SET1(s_dbl[0]);
+h2_imag = _AVX_SET1(s_dbl[1]);
 tmp1 = _AVX_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
@@ -1298,8 +1300,8 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex
 tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _AVX_STOREU(s_dbl, _CAST(tmp2));
-h2_real = _AVX_BROADCAST(&s_dbl[0]);
-h2_imag = _AVX_BROADCAST(&s_dbl[1]);
+h2_real = _AVX_SET1(s_dbl[0]);
+h2_imag = _AVX_SET1(s_dbl[1]);
 tmp1 = _AVX_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
@@ -1571,8 +1573,8 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double comple
 tmp2 = _AVX_ADDSUB( _AVX_MUL(h2_real, tmp2), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _AVX_STOREU(s_dbl, _CAST(tmp2));
-h2_real = _AVX_BROADCAST(&s_dbl[0]);
-h2_imag = _AVX_BROADCAST(&s_dbl[1]);
+h2_real = _AVX_SET1(s_dbl[0]);
+h2_imag = _AVX_SET1(s_dbl[1]);
 tmp1 = _AVX_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
...
@@ -487,12 +487,17 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex
 _SSE_STOREU(s_dbl, tmp2);
 #ifdef DOUBLE_PRECISION_COMPLEX
-h2_real = _mm_loaddup_pd(&s_dbl[0]);
-h2_imag = _mm_loaddup_pd(&s_dbl[1]);
+h2_real = _mm_set1_pd(s_dbl[0]);
+h2_imag = _mm_set1_pd(s_dbl[1]);
+// h2_real = _mm_loaddup_pd(&s_dbl[0]);
+// h2_imag = _mm_loaddup_pd(&s_dbl[1]);
 #endif
 #ifdef SINGLE_PRECISION_COMPLEX
-h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
-h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));
+h2_real = _mm_set1_ps(s_dbl[0]);
+h2_imag = _mm_set1_ps(s_dbl[1]);
+// h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
+// h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));
 #endif
 tmp1 = _SSE_MUL(h1_imag, y1);
@@ -953,8 +958,11 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double comple
 tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _SSE_STOREU(s_dbl, tmp2);
-h2_real = _mm_loaddup_pd(&s_dbl[0]);
-h2_imag = _mm_loaddup_pd(&s_dbl[1]);
+h2_real = _mm_set1_pd(s_dbl[0]);
+h2_imag = _mm_set1_pd(s_dbl[1]);
+// h2_real = _mm_loaddup_pd(&s_dbl[0]);
+// h2_imag = _mm_loaddup_pd(&s_dbl[1]);
 tmp1 = _SSE_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
@@ -1275,8 +1283,11 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double comple
 tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _SSE_STOREU(s_dbl, tmp2);
-h2_real = _mm_loaddup_pd(&s_dbl[0]);
-h2_imag = _mm_loaddup_pd(&s_dbl[1]);
+h2_real = _mm_set1_pd(s_dbl[0]);
+h2_imag = _mm_set1_pd(s_dbl[1]);
+// h2_real = _mm_loaddup_pd(&s_dbl[0]);
+// h2_imag = _mm_loaddup_pd(&s_dbl[1]);
 tmp1 = _SSE_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
@@ -1514,8 +1525,11 @@ static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double comple
 tmp2 = _SSE_ADDSUB( _SSE_MUL(h2_real, tmp2), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
 #endif
 _SSE_STOREU(s_dbl, tmp2);
-h2_real = _mm_loaddup_pd(&s_dbl[0]);
-h2_imag = _mm_loaddup_pd(&s_dbl[1]);
+h2_real = _mm_set1_pd(s_dbl[0]);
+h2_imag = _mm_set1_pd(s_dbl[1]);
+// h2_real = _mm_loaddup_pd(&s_dbl[0]);
+// h2_imag = _mm_loaddup_pd(&s_dbl[1]);
 tmp1 = _SSE_MUL(h1_imag, y1);
 #ifdef __ELPA_USE_FMA__
...