From 9ef8709f9387437b12be159fc2b9fcc51e67b78f Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Fri, 8 Apr 2016 15:18:29 +0200 Subject: [PATCH] Remove FMA references in SSE kernels, they are not used anyway --- .../elpa2_kernels_real_sse_2hv.c | 284 +----- .../elpa2_kernels_real_sse_4hv.c | 461 ++------- .../elpa2_kernels_real_sse_6hv.c | 874 +++++------------- 3 files changed, 338 insertions(+), 1281 deletions(-) diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c index 0e37b132..acd891d6 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c @@ -181,20 +181,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h2; -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_macc_pd(x3, h1, q3); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_macc_pd(x4, h1, q4); - __m128d q5 = _mm_load_pd(&q[8]); - __m128d y5 = _mm_macc_pd(x5, h1, q5); - __m128d q6 = _mm_load_pd(&q[10]); - __m128d y6 = _mm_macc_pd(x6, h1, q6); -#else __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); @@ -207,31 +193,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); __m128d q6 = _mm_load_pd(&q[10]); __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); -#endif + for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); 
- x3 = _mm_macc_pd(q3, h1, x3); - y3 = _mm_macc_pd(q3, h2, y3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - y4 = _mm_macc_pd(q4, h2, y4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - x5 = _mm_macc_pd(q5, h1, x5); - y5 = _mm_macc_pd(q5, h2, y5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - x6 = _mm_macc_pd(q6, h1, x6); - y6 = _mm_macc_pd(q6, h2, y6); -#else + q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); @@ -250,24 +217,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, q6 = _mm_load_pd(&q[(i*ldq)+10]); x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - x5 = _mm_macc_pd(q5, h1, x5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - x6 = _mm_macc_pd(q6, h1, x6); -#else + q1 = _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); @@ -280,7 +233,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); q6 = _mm_load_pd(&q[(nb*ldq)+10]); x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); -#endif ///////////////////////////////////////////////////// // Rank-2 update of Q [12 x nb+1] @@ -299,21 +251,13 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, x6 = _mm_mul_pd(x6, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); - y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); - y5 = 
_mm_macc_pd(y5, h1, _mm_mul_pd(x5,h2)); - y6 = _mm_macc_pd(y6, h1, _mm_mul_pd(x6,h2)); -#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); -#endif q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); @@ -335,26 +279,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, _mm_store_pd(&q[10],q6); h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); - _mm_store_pd(&q[ldq+6],q4); - q5 = _mm_load_pd(&q[ldq+8]); - q5 = _mm_add_pd(q5, _mm_macc_pd(y5, h2, x5)); - _mm_store_pd(&q[ldq+8],q5); - q6 = _mm_load_pd(&q[ldq+10]); - q6 = _mm_add_pd(q6, _mm_macc_pd(y6, h2, x6)); - _mm_store_pd(&q[ldq+10],q6); -#else + q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); @@ -373,32 +298,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, q6 = _mm_load_pd(&q[ldq+10]); q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); _mm_store_pd(&q[ldq+10],q6); -#endif + for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, 
h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_macc_pd(x5, h1, _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[(i*ldq)+8],q5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_macc_pd(x6, h1, _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[(i*ldq)+10],q6); -#else q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); _mm_store_pd(&q[i*ldq],q1); @@ -417,30 +322,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, q6 = _mm_load_pd(&q[(i*ldq)+10]); q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); _mm_store_pd(&q[(i*ldq)+10],q6); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_macc_pd(x3, h1, q3); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_macc_pd(x4, h1, q4); - _mm_store_pd(&q[(nb*ldq)+6],q4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - q5 = _mm_macc_pd(x5, h1, q5); - _mm_store_pd(&q[(nb*ldq)+8],q5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - q6 = _mm_macc_pd(x6, h1, q6); - _mm_store_pd(&q[(nb*ldq)+10],q6); -#else + q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); @@ -459,7 +344,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, q6 = _mm_load_pd(&q[(nb*ldq)+10]); q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); _mm_store_pd(&q[(nb*ldq)+10],q6); -#endif } /** @@ -487,16 +371,6 @@ 
__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h2; -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_macc_pd(x3, h1, q3); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_macc_pd(x4, h1, q4); -#else __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); @@ -505,26 +379,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); __m128d q4 = _mm_load_pd(&q[6]); __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); -#endif for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - y3 = _mm_macc_pd(q3, h2, y3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - y4 = _mm_macc_pd(q4, h2, y4); -#else + q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); @@ -537,20 +397,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int q4 = _mm_load_pd(&q[(i*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); -#else + q1 
= _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); @@ -559,7 +409,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); q4 = _mm_load_pd(&q[(nb*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); -#endif ///////////////////////////////////////////////////// // Rank-2 update of Q [8 x nb+1] @@ -576,17 +425,11 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int x4 = _mm_mul_pd(x4, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); - y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); -#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); -#endif q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); @@ -602,20 +445,7 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int _mm_store_pd(&q[6],q4); h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); - _mm_store_pd(&q[ldq+6],q4); -#else + q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); @@ -628,27 +458,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int q4 = _mm_load_pd(&q[ldq+6]); q4 = _mm_add_pd(q4, _mm_add_pd(x4, 
_mm_mul_pd(y4, h2))); _mm_store_pd(&q[ldq+6],q4); -#endif for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); -#else q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); _mm_store_pd(&q[i*ldq],q1); @@ -661,24 +476,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int q4 = _mm_load_pd(&q[(i*ldq)+6]); q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); _mm_store_pd(&q[(i*ldq)+6],q4); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_macc_pd(x3, h1, q3); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_macc_pd(x4, h1, q4); - _mm_store_pd(&q[(nb*ldq)+6],q4); -#else + q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); @@ -691,7 +492,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int q4 = _mm_load_pd(&q[(nb*ldq)+6]); q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); _mm_store_pd(&q[(nb*ldq)+6],q4); -#endif } /** @@ -717,51 +517,30 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int __m128d h1 = 
_mm_loaddup_pd(&hh[ldh+1]); __m128d h2; -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); -#else __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); -#endif for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); -#else + q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); q2 = _mm_load_pd(&q[(i*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); -#else + q1 = _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif ///////////////////////////////////////////////////// // Rank-2 update of Q [12 x nb+1] @@ -776,13 +555,9 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int x2 = _mm_mul_pd(x2, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); -#else + y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); -#endif q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); @@ -792,58 +567,33 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int _mm_store_pd(&q[2],q2); h2 = 
_mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); -#else + q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); q2 = _mm_load_pd(&q[ldq+2]); q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); _mm_store_pd(&q[ldq+2],q2); -#endif for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); -#else q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); _mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); _mm_store_pd(&q[(i*ldq)+2],q2); -#endif } h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); -#else + q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb*ldq)+2],q2); -#endif } diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c index 109a0004..664d0434 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c @@ -238,15 +238,6 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int 
__m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - register __m128d x1 = a1_1; -#else register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); @@ -254,22 +245,12 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); register __m128d x1 = a1_1; -#endif __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); __m128d a3_2 = _mm_load_pd(&q[ldq+2]); __m128d a4_2 = _mm_load_pd(&q[0+2]); -#ifdef __ELPA_USE_FMA__ - register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); - register __m128d x2 = a1_2; -#else register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); @@ -277,22 +258,12 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); register __m128d x2 = a1_2; -#endif __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); __m128d a3_3 = _mm_load_pd(&q[ldq+4]); __m128d a4_3 = _mm_load_pd(&q[0+4]); -#ifdef __ELPA_USE_FMA__ - register __m128d w3 = _mm_macc_pd(a3_3, h_4_3, a4_3); - w3 
= _mm_macc_pd(a2_3, h_4_2, w3); - w3 = _mm_macc_pd(a1_3, h_4_1, w3); - register __m128d z3 = _mm_macc_pd(a2_3, h_3_2, a3_3); - z3 = _mm_macc_pd(a1_3, h_3_1, z3); - register __m128d y3 = _mm_macc_pd(a1_3, h_2_1, a2_3); - register __m128d x3 = a1_3; -#else register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); @@ -300,7 +271,6 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); register __m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); register __m128d x3 = a1_3; -#endif __m128d q1; __m128d q2; @@ -318,48 +288,27 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(i*ldq)+2]); q3 = _mm_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif h2 = _mm_loaddup_pd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); - z3 = _mm_macc_pd(q3, h3, z3); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); -#endif h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); - w3 = _mm_macc_pd(q3, h4, w3); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); -#endif } h1 = 
_mm_loaddup_pd(&hh[nb-3]); @@ -368,78 +317,47 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(nb*ldq)+2]); q3 = _mm_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); - z3 = _mm_macc_pd(q3, h3, z3); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); -#endif h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, 
h1, x3); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [6 x nb+3] @@ -457,15 +375,10 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int h1 = tau2; h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_msub_pd(y3, h1, _mm_mul_pd(x3,h2)); -#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); -#endif __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); @@ -474,15 +387,10 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int h1 = tau3; h2 = _mm_mul_pd(h1, vs_1_3); h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); - z3 = _mm_msub_pd(z3, h1, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2))); -#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); z3 = _mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); -#endif __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); @@ -493,15 +401,10 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int h2 = _mm_mul_pd(h1, vs_1_4); h3 = _mm_mul_pd(h1, vs_2_4); h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); - w3 = _mm_msub_pd(w3, h1, 
_mm_macc_pd(z3, h4, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2)))); -#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); -#endif q1 = _mm_load_pd(&q[0]); q2 = _mm_load_pd(&q[2]); @@ -517,15 +420,11 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[ldq]); q2 = _mm_load_pd(&q[ldq+2]); q3 = _mm_load_pd(&q[ldq+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); - q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); - q3 = _mm_sub_pd(q3, _mm_macc_pd(w3, h4, z3)); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); -#endif + _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[ldq+2],q2); _mm_store_pd(&q[ldq+4],q3); @@ -537,25 +436,16 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_sub_pd(q1, y1); q2 = _mm_sub_pd(q2, y2); q3 = _mm_sub_pd(q3, y3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); -#endif h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif + _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); _mm_store_pd(&q[(ldq*2)+4],q3); @@ -567,35 +457,22 @@ __forceinline void 
hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_sub_pd(q1, x1); q2 = _mm_sub_pd(q2, x2); q3 = _mm_sub_pd(q3, x3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif _mm_store_pd(&q[ldq*3], q1); _mm_store_pd(&q[(ldq*3)+2], q2); _mm_store_pd(&q[(ldq*3)+4], q3); @@ -608,48 +485,27 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(i*ldq)+2]); q3 = _mm_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); -#endif h2 = _mm_loaddup_pd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1,h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); -#endif h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else + q1 = 
_mm_sub_pd(q1, _mm_mul_pd(z1,h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); -#endif h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); -#endif _mm_store_pd(&q[i*ldq],q1); _mm_store_pd(&q[(i*ldq)+2],q2); @@ -660,35 +516,23 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q3 = _mm_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif + _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); _mm_store_pd(&q[(nb*ldq)+4],q3); @@ -697,25 +541,17 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else + q1 = 
_mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); _mm_store_pd(&q[((nb+1)*ldq)+4],q3); @@ -724,15 +560,11 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); _mm_store_pd(&q[((nb+2)*ldq)+4],q3); @@ -764,15 +596,6 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - __m128d x1 = a1_1; -#else __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); @@ -780,22 +603,12 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); __m128d x1 = a1_1; 
-#endif __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); __m128d a3_2 = _mm_load_pd(&q[ldq+2]); __m128d a4_2 = _mm_load_pd(&q[0+2]); -#ifdef __ELPA_USE_FMA__ - __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); - __m128d x2 = a1_2; -#else __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); @@ -803,7 +616,6 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); __m128d x2 = a1_2; -#endif __m128d q1; __m128d q2; @@ -821,30 +633,18 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); - w1 = _mm_macc_pd(q1, h4, w1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - z2 = _mm_macc_pd(q2, h3, z2); - w2 = _mm_macc_pd(q2, h4, w2); -#else + x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-3]); @@ -854,21 +654,12 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 
= _mm_macc_pd(q2, h1, x2); - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif h1 = _mm_loaddup_pd(&hh[nb-2]); h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); @@ -876,30 +667,18 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] @@ -923,36 +702,24 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h1 = tau2; h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); -#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); -#endif h1 = tau3; h2 = _mm_mul_pd(h1, vs_1_3); h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); -#else + z1 = 
_mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); -#endif h1 = tau4; h2 = _mm_mul_pd(h1, vs_1_4); h3 = _mm_mul_pd(h1, vs_2_4); h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif q1 = _mm_load_pd(&q[0]); q2 = _mm_load_pd(&q[2]); @@ -964,13 +731,10 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq]); q2 = _mm_load_pd(&q[ldq+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); - q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); -#endif + _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[ldq+2],q2); @@ -978,13 +742,9 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); q1 = _mm_load_pd(&q[ldq*2]); q2 = _mm_load_pd(&q[(ldq*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); - q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); -#endif _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); @@ -993,13 +753,10 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, 
double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); q1 = _mm_load_pd(&q[ldq*3]); q2 = _mm_load_pd(&q[(ldq*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); - q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_macc_pd(y2, h2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4))))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); -#endif + _mm_store_pd(&q[ldq*3], q1); _mm_store_pd(&q[(ldq*3)+2], q2); @@ -1011,19 +768,15 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); -#endif + _mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_macc_pd(w2, h4, _mm_mul_pd(z2, h3)), _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2)))); -#else + q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); -#endif + _mm_store_pd(&q[(i*ldq)+2],q2); } @@ -1032,13 +785,10 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); - q2 = _mm_sub_pd(q2, _mm_macc_pd(x2, h1, _mm_macc_pd(z2, h3, _mm_mul_pd(y2, h2)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, 
h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); -#endif + _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); @@ -1046,26 +796,20 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_macc_pd(y2, h2, _mm_mul_pd(x2, h1))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, h1))); -#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); } @@ -1096,15 +840,6 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - __m128d x1 = a1_1; -#else __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); @@ -1112,7 +847,6 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); 
__m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); __m128d x1 = a1_1; -#endif __m128d q1; @@ -1129,51 +863,33 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); - w1 = _mm_macc_pd(q1, h4, w1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-3]); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif h1 = _mm_loaddup_pd(&hh[nb-2]); h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [2 x nb+3] ///////////////////////////////////////////////////// @@ -1195,30 +911,21 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int h1 = tau2; h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); -#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); -#endif h1 = tau3; h2 = _mm_mul_pd(h1, vs_1_3); h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = 
_mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); -#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); -#endif h1 = tau4; h2 = _mm_mul_pd(h1, vs_1_4); h3 = _mm_mul_pd(h1, vs_2_4); h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); -#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif q1 = _mm_load_pd(&q[0]); q1 = _mm_sub_pd(q1, w1); @@ -1226,32 +933,26 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); -#endif + _mm_store_pd(&q[ldq],q1); h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); q1 = _mm_load_pd(&q[ldq*2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); -#endif + _mm_store_pd(&q[ldq*2],q1); h2 = _mm_loaddup_pd(&hh[ldh+1]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); q1 = _mm_load_pd(&q[ldq*3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); -#endif + _mm_store_pd(&q[ldq*3], q1); for (i = 4; i < nb; i++) @@ -1262,11 +963,9 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, 
_mm_mul_pd(y1, h2)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); -#endif + _mm_store_pd(&q[i*ldq],q1); } @@ -1274,29 +973,23 @@ __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); -#endif + _mm_store_pd(&q[nb*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-2]); h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); -#else + q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); -#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); } diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c index f9cacbc8..bc19037c 100644 --- a/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c +++ b/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c @@ -414,58 +414,39 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1); - t1 = _mm_macc_pd(a4_1, h_6_4, t1); - t1 = _mm_macc_pd(a3_1, h_6_3, t1); - t1 = _mm_macc_pd(a2_1, h_6_2, t1); - t1 = _mm_macc_pd(a1_1, h_6_1, t1); -#else + register __m128d t1 = _mm_add_pd(a6_1, 
_mm_mul_pd(a5_1, h_6_5)); t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); -#endif + __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1); - v1 = _mm_macc_pd(a3_1, h_5_3, v1); - v1 = _mm_macc_pd(a2_1, h_5_2, v1); - v1 = _mm_macc_pd(a1_1, h_5_1, v1); -#else + register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); -#endif + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); -#else + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); -#endif + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); -#else + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); -#endif + register __m128d x1 = a1_1; __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); @@ -475,23 +456,6 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int __m128d a5_2 = 
_mm_load_pd(&q[(ldq)+2]); __m128d a6_2 = _mm_load_pd(&q[2]); -#ifdef __ELPA_USE_FMA__ - register __m128d t2 = _mm_macc_pd(a5_2, h_6_5, a6_2); - t2 = _mm_macc_pd(a4_2, h_6_4, t2); - t2 = _mm_macc_pd(a3_2, h_6_3, t2); - t2 = _mm_macc_pd(a2_2, h_6_2, t2); - t2 = _mm_macc_pd(a1_2, h_6_1, t2); - register __m128d v2 = _mm_macc_pd(a4_2, h_5_4, a5_2); - v2 = _mm_macc_pd(a3_2, h_5_3, v2); - v2 = _mm_macc_pd(a2_2, h_5_2, v2); - v2 = _mm_macc_pd(a1_2, h_5_1, v2); - register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); -#else register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); @@ -507,7 +471,7 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); -#endif + register __m128d x2 = a1_2; __m128d q1; @@ -525,189 +489,120 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int h1 = _mm_loaddup_pd(&hh[i-5]); q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else + z1 = _mm_add_pd(z1, 
_mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); - v2 = _mm_macc_pd(q2, h5, v2); -#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_macc_pd(q1, h6, t1); - t2 = _mm_macc_pd(q2, h6, t2); -#else + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); -#endif } h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); - v2 = _mm_macc_pd(q2, h5, v2); -#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); -#endif h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = 
_mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif + h2 = 
_mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products @@ -720,26 +615,18 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); h2 = _mm_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, tau2, _mm_mul_pd(x2,h2)); -#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); -#endif __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); h2 = _mm_mul_pd(tau3, vs_1_3); h3 = _mm_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, tau3, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); -#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); -#endif __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); @@ -748,13 +635,9 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int h3 = _mm_mul_pd(tau4, vs_2_4); __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); h4 = _mm_mul_pd(tau4, vs_3_4); -#ifdef 
__ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, tau4, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); @@ -765,13 +648,9 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); h4 = _mm_mul_pd(tau5, vs_3_5); h5 = _mm_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - v2 = _mm_msub_pd(v2, tau5, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); @@ -784,13 +663,9 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int h4 = _mm_mul_pd(tau6, vs_3_6); h5 = _mm_mul_pd(tau6, vs_4_6); h6 = _mm_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); - t2 = _mm_msub_pd(t2, tau6, _mm_macc_pd(v2, h6, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))))); -#else + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), 
_mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))))); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] @@ -808,13 +683,10 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(ldq+2)]); q1 = _mm_sub_pd(q1, v1); q2 = _mm_sub_pd(q2, v2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[(ldq+2)],q2); @@ -823,21 +695,14 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(ldq*2)+2]); q1 = _mm_sub_pd(q1, w1); q2 = _mm_sub_pd(q2, w2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); @@ -846,29 +711,20 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(ldq*3)+2]); q1 = _mm_sub_pd(q1, z1); q2 = _mm_sub_pd(q2, z2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = 
_mm_nmacc_pd(v2, h5, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[ldq*3],q1); _mm_store_pd(&q[(ldq*3)+2],q2); @@ -877,37 +733,25 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(ldq*4)+2]); q1 = _mm_sub_pd(q1, y1); q2 = _mm_sub_pd(q2, y2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[ldq*4],q1); _mm_store_pd(&q[(ldq*4)+2],q2); @@ -916,45 +760,30 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q2 = _mm_load_pd(&q[(ldq*5)+2]); q1 = _mm_sub_pd(q1, x1); q2 = _mm_sub_pd(q2, x2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = 
_mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[ldq*5],q1); _mm_store_pd(&q[(ldq*5)+2],q2); @@ -963,53 +792,35 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); h1 = _mm_loaddup_pd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + h5 = 
_mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif + _mm_store_pd(&q[i*ldq],q1); _mm_store_pd(&q[(i*ldq)+2],q2); } @@ -1017,145 +828,100 @@ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif + _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); 
-#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = 
_mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif + _mm_store_pd(&q[(nb+3)*ldq],q1); _mm_store_pd(&q[((nb+3)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif + _mm_store_pd(&q[(nb+4)*ldq],q1); _mm_store_pd(&q[((nb+4)*ldq)+2],q2); } @@ -1186,58 +952,39 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1); - t1 = _mm_macc_pd(a4_1, h_6_4, t1); - t1 = _mm_macc_pd(a3_1, h_6_3, t1); - t1 = _mm_macc_pd(a2_1, h_6_2, t1); - t1 = _mm_macc_pd(a1_1, h_6_1, t1); -#else + register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); -#endif + __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1); - v1 = _mm_macc_pd(a3_1, h_5_3, v1); - v1 = _mm_macc_pd(a2_1, h_5_2, v1); - v1 = _mm_macc_pd(a1_1, h_5_1, v1); -#else + register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, 
h_5_3)); v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); -#endif + __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); -#else + register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); -#endif + __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); -#else + register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); -#endif + register __m128d x1 = a1_1; __m128d q1; @@ -1253,142 +1000,96 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int { h1 = _mm_loaddup_pd(&hh[i-5]); q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); -#else + v1 = 
_mm_add_pd(v1, _mm_mul_pd(q1,h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_macc_pd(q1, h6, t1); -#else + t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); -#endif + } h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); -#else + v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); -#endif + h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else + w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif + h3 = 
_mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else + z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else + y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else + x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products @@ -1400,22 +1101,16 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); h2 = _mm_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); -#else + y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); -#endif __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); h2 = _mm_mul_pd(tau3, vs_1_3); h3 = _mm_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); -#else + z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); -#endif __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); @@ -1424,11 +1119,8 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int h3 = _mm_mul_pd(tau4, vs_2_4); __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); h4 = _mm_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); 
-#else + w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); @@ -1439,11 +1131,8 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); h4 = _mm_mul_pd(tau5, vs_3_5); h5 = _mm_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); -#else + v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); @@ -1456,11 +1145,8 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int h4 = _mm_mul_pd(tau6, vs_3_6); h5 = _mm_mul_pd(tau6, vs_4_6); h6 = _mm_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); -#else + t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [2 x nb+3] @@ -1473,257 +1159,185 @@ __forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); q1 = _mm_load_pd(&q[ldq]); q1 = _mm_sub_pd(q1, v1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[ldq],q1); h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); q1 = _mm_load_pd(&q[ldq*2]); q1 = _mm_sub_pd(q1, w1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + 
q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[ldq*2],q1); h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq*3]); q1 = _mm_sub_pd(q1, z1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[ldq*3],q1); h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); q1 = _mm_load_pd(&q[ldq*4]); q1 = _mm_sub_pd(q1, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[ldq*4],q1); h2 = _mm_loaddup_pd(&hh[(ldh)+1]); q1 = _mm_load_pd(&q[ldq*5]); q1 = _mm_sub_pd(q1, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif + 
h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[ldq*5],q1); for (i = 6; i < nb; i++) { q1 = _mm_load_pd(&q[i*ldq]); h1 = _mm_loaddup_pd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif + h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif + _mm_store_pd(&q[i*ldq],q1); } h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif 
+ h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif + _mm_store_pd(&q[nb*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif + _mm_store_pd(&q[(nb+1)*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif + _mm_store_pd(&q[(nb+2)*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif + _mm_store_pd(&q[(nb+3)*ldq],q1); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else + q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif + _mm_store_pd(&q[(nb+4)*ldq],q1); } -- 
GitLab