Commit 3328e7bb (unverified), authored by Andreas Marek
Browse files

Error in AVX2 single precision kernels

parent 69fd894b
......@@ -585,7 +585,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
h2 = _mm256_mul_ps(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(y1, h1, _mm256_mul_ps(x1,h2));
- y2 = _mm256_FMA_ps(ys, h1, _mm256_mul_ps(x2,h2));
+ y2 = _mm256_FMA_ps(y2, h1, _mm256_mul_ps(x2,h2));
// y3 = _mm256_FMA_ps(y3, h1, _mm256_mul_ps(x3,h2));
// y4 = _mm256_FMA_ps(y4, h1, _mm256_mul_ps(x4,h2));
#else
......
......@@ -380,7 +380,7 @@ __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
y2 = _mm256_FMA_ps(q2, h2, y2);
- / y3 = _mm256_FMA_ps(q3, h2, y3);
+ // y3 = _mm256_FMA_ps(q3, h2, y3);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
......@@ -915,7 +915,7 @@ __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb
h3 = _mm256_mul_ps(h1, vs_2_3);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMSUB_ps(z1, h1, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)));
- z2 = _mm256_FMSUB_ps(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_ps(x2,h2)));
+ z2 = _mm256_FMSUB_ps(z2, h1, _mm256_FMA_ps(y2, h3, _mm256_mul_ps(x2,h2)));
#else
z1 = _mm256_sub_ps(_mm256_mul_ps(z1,h1), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)));
z2 = _mm256_sub_ps(_mm256_mul_ps(z2,h1), _mm256_add_ps(_mm256_mul_ps(y2,h3), _mm256_mul_ps(x2,h2)));
......
......@@ -54,16 +54,16 @@
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c)
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
#endif
#endif
......@@ -1511,7 +1511,7 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
#endif
h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]);
#ifdef __ELPA_USE_FMA__
- q1 = _mm256_NFMA_pd(z1, h3, q1);
+ q1 = _mm256_NFMA_ps(z1, h3, q1);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3));
#endif
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment