From 46ed16fd81f8c6390d25d2578c8d2e14211eae5e Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Thu, 23 Nov 2017 11:35:09 +0100 Subject: [PATCH] Double precision real block4 kernel for Sparc64 --- .../real_sparc64_6hv_single_precision.c | 2 +- src/elpa2/kernels/real_sse_4hv_template.c | 970 +++++++++++++++++- 2 files changed, 963 insertions(+), 9 deletions(-) diff --git a/src/elpa2/kernels/real_sparc64_6hv_single_precision.c b/src/elpa2/kernels/real_sparc64_6hv_single_precision.c index b0f3428d..38421ea6 100644 --- a/src/elpa2/kernels/real_sparc64_6hv_single_precision.c +++ b/src/elpa2/kernels/real_sparc64_6hv_single_precision.c @@ -51,5 +51,5 @@ #include "../../general/precision_macros.h" #include "real_sse_6hv_template.c" #undef REALCASE -#undef SINGLE__PRECISION +#undef SINGLE_PRECISION diff --git a/src/elpa2/kernels/real_sse_4hv_template.c b/src/elpa2/kernels/real_sse_4hv_template.c index b8fc8af7..9b4ea528 100644 --- a/src/elpa2/kernels/real_sse_4hv_template.c +++ b/src/elpa2/kernels/real_sse_4hv_template.c @@ -62,6 +62,15 @@ #include "config-f90.h" +#ifdef HAVE_SSE_INTRINSICS +#include +#endif +#ifdef HAVE_SPARC64_SSE +#include +#include +#endif +#include +#include #ifdef DOUBLE_PRECISION_REAL #define offset 2 @@ -82,10 +91,6 @@ #define _SSE_STORE _mm_store_ps #endif -#include -#include -#include - #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_SSE_INTRINSICS @@ -93,6 +98,7 @@ #endif //Forward declaration +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); @@ -103,13 +109,36 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, 
__forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_2_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_4_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); +#endif +#ifdef SINGLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); +__forceinline void hh_trafo_kernel_8_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); +__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); +#endif +#endif +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif #ifdef SINGLE_PRECISION_REAL void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +void 
quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif +#ifdef SINGLE_PRECISION_REAL +void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); +#endif +#endif /* !f>#ifdef HAVE_SSE_INTRINSICS @@ -125,6 +154,20 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, !f>#endif */ +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_sparc64_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_sparc64_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface @@ -139,11 +182,36 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, !f>#endif */ +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_sparc64_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_sparc64_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) #endif #ifdef SINGLE_PRECISION_REAL void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh) +#endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +void quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) +#endif +#ifdef SINGLE_PRECISION_REAL +void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh) 
+#endif + #endif { @@ -206,14 +274,25 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, #ifdef DOUBLE_PRECISION_REAL for (i = 0; i < nq-4; i+=6) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_6_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif worked_on += 6; } #endif #ifdef SINGLE_PRECISION_REAL for (i = 0; i < nq-8; i+=12) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_12_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_12_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif + worked_on += 12; } @@ -226,7 +305,12 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, #ifdef DOUBLE_PRECISION_REAL if (nq-i ==4) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_4_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif worked_on += 4; } #endif @@ -234,7 +318,12 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, #ifdef SINGLE_PRECISION_REAL if (nq-i ==8) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_8_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_8_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif worked_on += 8; } #endif @@ -242,7 +331,13 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, #ifdef DOUBLE_PRECISION_REAL if (nq-i == 2) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, 
s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_2_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif + worked_on += 2; } #endif @@ -250,14 +345,25 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, #ifdef SINGLE_PRECISION_REAL if (nq-i ==4) { +#ifdef HAVE_SSE_INTRINSICS hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif +#ifdef HAVE_SPARC64_SSE + hh_trafo_kernel_4_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); +#endif worked_on += 4; } #endif #ifdef WITH_DEBUG if (worked_on != nq) { +#ifdef HAVE_SSE_INTRINSICS printf("Error in real SSE BLOCK4 kernel \n"); +#endif +#ifdef HAVE_SPARC64_SSE + printf("Error in real SPARC64 BLOCK4 kernel \n"); +#endif + abort(); } #endif @@ -275,12 +381,22 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, * matrix Vector product with two householder * vectors + a rank 1 update is performed */ +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) #endif #ifdef SINGLE_PRECISION_REAL __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +#endif +#ifdef SINGLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) 
+#endif +#endif { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [6 x nb+3] * hh @@ -293,6 +409,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq]); __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[0]); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]); __SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]); @@ -310,6 +427,27 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb __m128 h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]); __m128 h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); + __SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#ifdef SINGLE_PRECISION_REAL + __m128 h_2_1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); // h_2_1 contains four times hh[ldh+1] + __m128 h_3_2 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __m128 h_3_1 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __m128 h_4_3 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __m128 h_4_2 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __m128 h_4_1 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#endif @@ -358,11 +496,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb for(i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[i-3]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = 
_mm_set_ps(hh[i-3], hh[i-3]); +#endif #endif q1 = _SSE_LOAD(&q[i*ldq]); q2 = _SSE_LOAD(&q[(i*ldq)+offset]); @@ -372,45 +521,90 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1)); x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+i-2]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+i-2]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); +#endif + #endif y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2)); y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+i-1]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); +#endif +#endif + z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3)); z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3)); - +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = _mm_set1_pd(hh[(ldh*3)+i]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+i]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#endif + w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4)); w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4)); w3 = _SSE_ADD(w3, _SSE_MUL(q3,h4)); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-3]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-3]); #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = 
_mm_set_pd(hh[nb-3], hh[nb-3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); +#endif +#endif q1 = _SSE_LOAD(&q[nb*ldq]); q2 = _SSE_LOAD(&q[(nb*ldq)+offset]); q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]); @@ -419,33 +613,68 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1)); x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+nb-2]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+nb-2]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif #endif y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2)); y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#endif + z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3)); z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]); q3 = _SSE_LOAD(&q[((nb+1)*ldq)+2*offset]); @@ -454,22 +683,48 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb x2 = _SSE_ADD(x2, 
_SSE_MUL(q2,h1)); x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[(ldh*1)+nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+nb-1], hh[(ldh*1)+nb-1]); +#endif +#endif + + y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2)); y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-1], hh[nb-1]); +#endif +#endif + + q1 = _SSE_LOAD(&q[(nb+2)*ldq]); q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]); q3 = _SSE_LOAD(&q[((nb+2)*ldq)+2*offset]); @@ -482,18 +737,31 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb // Rank-1 update of Q [6 x nb+3] ///////////////////////////////////////////////////// +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]); #endif #ifdef SINGLE_PRECISION_REAL __m128 tau1 = _mm_set1_ps(hh[0]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 tau1 = _mm_set_ps(hh[0], hh[0]); +#endif +#endif + h1 = tau1; x1 = _SSE_MUL(x1, h1); x2 = _SSE_MUL(x2, h1); - x3 = _SSE_MUL(x3, h1); + x3 = _SSE_MUL(x3, h1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]); __SSE_DATATYPE vs_1_2 = _mm_set1_pd(s_1_2); @@ -502,6 +770,19 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb __m128 tau2 = _mm_set1_ps(hh[ldh]); 
__m128 vs_1_2 = _mm_set1_ps(s_1_2); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]); + __SSE_DATATYPE vs_1_2 = _mm_set_pd(s_1_2, s_1_2); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 tau2 = _mm_set_ps(hh[ldh], hh[ldh]); + __m128 vs_1_2 = _mm_set_ps(s_1_2, s_1_2); +#endif +#endif + h1 = tau2; h2 = _SSE_MUL(h1, vs_1_2); @@ -510,6 +791,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb y2 = _SSE_SUB(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2)); y3 = _SSE_SUB(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau3 = _mm_set1_pd(hh[ldh*2]); __SSE_DATATYPE vs_1_3 = _mm_set1_pd(s_1_3); @@ -520,6 +802,21 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb __m128 vs_1_3 = _mm_set1_ps(s_1_3); __m128 vs_2_3 = _mm_set1_ps(s_2_3); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau3 = _mm_set_pd(hh[ldh*2], hh[ldh*2]); + __SSE_DATATYPE vs_1_3 = _mm_set_pd(s_1_3, s_1_3); + __SSE_DATATYPE vs_2_3 = _mm_set_pd(s_2_3, s_2_3); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 tau3 = _mm_set_ps(hh[ldh*2], hh[ldh*2]); + __m128 vs_1_3 = _mm_set_ps(s_1_3, s_1_3); + __m128 vs_2_3 = _mm_set_ps(s_2_3, s_2_3); +#endif +#endif + h1 = tau3; h2 = _SSE_MUL(h1, vs_1_3); @@ -529,6 +826,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb z2 = _SSE_SUB(_SSE_MUL(z2,h1), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))); z3 = _SSE_SUB(_SSE_MUL(z3,h1), _SSE_ADD(_SSE_MUL(y3,h3), _SSE_MUL(x3,h2))); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau4 = _mm_set1_pd(hh[ldh*3]); __SSE_DATATYPE vs_1_4 = _mm_set1_pd(s_1_4); @@ -540,6 +838,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb __m128 vs_1_4 = _mm_set1_ps(s_1_4); __m128 vs_2_4 = _mm_set1_ps(s_2_4); __m128 vs_3_4 = _mm_set1_ps(s_3_4); 
+#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau4 = _mm_set_pd(hh[ldh*3], hh[ldh*3]); + __SSE_DATATYPE vs_1_4 = _mm_set_pd(s_1_4, s_1_4); + __SSE_DATATYPE vs_2_4 = _mm_set_pd(s_2_4, s_2_4); + __SSE_DATATYPE vs_3_4 = _mm_set_pd(s_3_4, s_3_4); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 tau4 = _mm_set_ps(hh[ldh*3], hh[ldh*3]); + __m128 vs_1_4 = _mm_set_ps(s_1_4, s_1_4); + __m128 vs_2_4 = _mm_set_ps(s_2_4, s_2_4); + __m128 vs_3_4 = _mm_set_ps(s_3_4, s_3_4); +#endif #endif h1 = tau4; @@ -561,12 +875,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[offset],q2); _SSE_STORE(&q[2*offset],q3); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = _mm_set1_pd(hh[(ldh*3)+1]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq]); q2 = _SSE_LOAD(&q[ldq+offset]); @@ -580,12 +906,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[ldq+offset],q2); _SSE_STORE(&q[ldq+2*offset],q3); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = _mm_set1_pd(hh[(ldh*3)+2]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*2]); q2 = _SSE_LOAD(&q[(ldq*2)+offset]); q3 = _SSE_LOAD(&q[(ldq*2)+2*offset]); @@ -597,12 +935,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4)); q3 = _SSE_SUB(q3, _SSE_MUL(w3, h4)); +#ifdef HAVE_SSE_INTRINSICS #ifdef 
DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+1]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3)); q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3)); q3 = _SSE_SUB(q3, _SSE_MUL(z3, h3)); @@ -611,12 +961,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[(ldq*2)+offset],q2); _SSE_STORE(&q[(ldq*2)+2*offset],q3); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = _mm_set1_pd(hh[(ldh*3)+3]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*3]); q2 = _SSE_LOAD(&q[(ldq*3)+offset]); q3 = _SSE_LOAD(&q[(ldq*3)+2*offset]); @@ -628,23 +990,47 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4)); q3 = _SSE_SUB(q3, _SSE_MUL(w3, h4)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+1]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2)); q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2)); q3 = _SSE_SUB(q3, _SSE_MUL(y3, h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+2]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef 
DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3)); q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3)); @@ -655,11 +1041,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb for (i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[i-3]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[i-3], hh[i-3]); +#endif #endif q1 = _SSE_LOAD(&q[i*ldq]); @@ -670,33 +1067,68 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb q2 = _SSE_SUB(q2, _SSE_MUL(x2,h1)); q3 = _SSE_SUB(q3, _SSE_MUL(x3,h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+i-2]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+i-2]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); +#endif #endif q1 = _SSE_SUB(q1, _SSE_MUL(y1,h2)); q2 = _SSE_SUB(q2, _SSE_MUL(y2,h2)); q3 = _SSE_SUB(q3, _SSE_MUL(y3,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+i-1]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(z1,h3)); q2 = _SSE_SUB(q2, _SSE_MUL(z2,h3)); q3 = _SSE_SUB(q3, _SSE_MUL(z3,h3)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = 
_mm_set1_pd(hh[(ldh*3)+i]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+i]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(w1,h4)); q2 = _SSE_SUB(q2, _SSE_MUL(w2,h4)); q3 = _SSE_SUB(q3, _SSE_MUL(w3,h4)); @@ -706,12 +1138,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[(i*ldq)+2*offset],q3); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-3]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-3], hh[nb-3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); +#endif +#endif + q1 = _SSE_LOAD(&q[nb*ldq]); q2 = _SSE_LOAD(&q[(nb*ldq)+offset]); q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]); @@ -720,24 +1164,47 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1)); q3 = _SSE_SUB(q3, _SSE_MUL(x3, h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+nb-2]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+nb-2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#endif q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2)); q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2)); q3 = _SSE_SUB(q3, _SSE_MUL(y3, h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif 
+#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#endif + q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3)); q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3)); @@ -747,12 +1214,24 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[(nb*ldq)+offset],q2); _SSE_STORE(&q[(nb*ldq)+2*offset],q3); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]); q3 = _SSE_LOAD(&q[((nb+1)*ldq)+2*offset]); @@ -761,11 +1240,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1)); q3 = _SSE_SUB(q3, _SSE_MUL(x3, h1)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h2 = _mm_set1_ps(hh[ldh+nb-1]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif #endif q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2)); @@ -776,11 +1266,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb _SSE_STORE(&q[((nb+1)*ldq)+offset],q2); _SSE_STORE(&q[((nb+1)*ldq)+2*offset],q3); +#ifdef HAVE_SSE_INTRINSICS +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set1_pd(hh[nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set1_ps(hh[nb-1]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE #ifdef DOUBLE_PRECISION_REAL - h1 = _mm_set1_pd(hh[nb-1]); + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL - h1 = _mm_set1_ps(hh[nb-1]); + h1 = _mm_set_ps(hh[nb-1], 
hh[nb-1]); +#endif #endif q1 = _SSE_LOAD(&q[(nb+2)*ldq]); @@ -807,12 +1308,23 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb * matrix Vector product with two householder * vectors + a rank 1 update is performed */ +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) #endif #ifdef SINGLE_PRECISION_REAL __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_4_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +#endif +#ifdef SINGLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_8_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) +#endif +#endif + { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh @@ -825,6 +1337,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq]); __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[0]); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]); __SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]); @@ -842,6 +1355,27 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, __m128 h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]); __m128 h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); + __SSE_DATATYPE h_3_2 = 
_mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#ifdef SINGLE_PRECISION_REAL + __m128 h_2_1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); // h_2_1 contains four times hh[ldh+1] + __m128 h_3_2 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __m128 h_3_1 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __m128 h_4_3 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __m128 h_4_2 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __m128 h_4_1 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#endif __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3)); @@ -876,6 +1410,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, for(i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); h2 = _mm_set1_pd(hh[ldh+i-2]); @@ -888,6 +1423,23 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h2 = _mm_set1_ps(hh[ldh+i-2]); h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); h4 = _mm_set1_ps(hh[(ldh*3)+i]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif + +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[i-3], hh[i-3]); + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif #endif q1 = _SSE_LOAD(&q[i*ldq]); @@ -905,6 +1457,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4)); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = 
_mm_set1_pd(hh[nb-3]); h2 = _mm_set1_pd(hh[ldh+nb-2]); @@ -914,6 +1467,20 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h1 = _mm_set1_ps(hh[nb-3]); h2 = _mm_set1_ps(hh[ldh+nb-2]); h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-3], hh[nb-3]); + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); + h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif #endif q1 = _SSE_LOAD(&q[nb*ldq]); @@ -926,6 +1493,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); h2 = _mm_set1_pd(hh[(ldh*1)+nb-1]); @@ -934,6 +1502,19 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h1 = _mm_set1_ps(hh[nb-2]); h2 = _mm_set1_ps(hh[(ldh*1)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); + h2 = _mm_set_pd(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); + h2 = _mm_set_ps(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]); @@ -943,12 +1524,24 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); +#endif +#ifdef 
SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-1], hh[nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+2)*ldq]); q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]); @@ -960,6 +1553,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, // Rank-1 update of Q [4 x nb+3] ///////////////////////////////////////////////////// +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]); __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]); @@ -987,6 +1581,37 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, __m128 vs_2_4 = _mm_set1_ps(s_2_4); __m128 vs_3_4 = _mm_set1_ps(s_3_4); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]); + __SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]); + __SSE_DATATYPE tau3 = _mm_set_pd(hh[ldh*2], hh[ldh*2]); + __SSE_DATATYPE tau4 = _mm_set_pd(hh[ldh*3], hh[ldh*3]); + + __SSE_DATATYPE vs_1_2 = _mm_set_pd(s_1_2, s_1_2); + __SSE_DATATYPE vs_1_3 = _mm_set_pd(s_1_3, s_1_3); + __SSE_DATATYPE vs_2_3 = _mm_set_pd(s_2_3, s_2_3); + __SSE_DATATYPE vs_1_4 = _mm_set_pd(s_1_4, s_1_4); + __SSE_DATATYPE vs_2_4 = _mm_set_pd(s_2_4, s_2_4); + __SSE_DATATYPE vs_3_4 = _mm_set_pd(s_3_4, s_3_4); +#endif + +#ifdef SINGLE_PRECISION_REAL + __m128 tau1 = _mm_set_ps(hh[0], hh[0]); + __m128 tau2 = _mm_set_ps(hh[ldh], hh[ldh]); + __m128 tau3 = _mm_set_ps(hh[ldh*2], hh[ldh*2]); + __m128 tau4 = _mm_set_ps(hh[ldh*3], hh[ldh*3]); + + __m128 vs_1_2 = _mm_set_ps(s_1_2, s_1_2); + __m128 vs_1_3 = _mm_set_ps(s_1_3, s_1_3); + __m128 vs_2_3 = _mm_set_ps(s_2_3, s_2_3); + __m128 vs_1_4 = _mm_set_ps(s_1_4, s_1_4); + __m128 vs_2_4 = _mm_set_ps(s_2_4, s_2_4); + __m128 vs_3_4 = _mm_set_ps(s_3_4, s_3_4); +#endif +#endif h1 = tau1; @@ -1021,12 +1646,24 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[0],q1); _SSE_STORE(&q[offset],q2); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 
= _mm_set1_pd(hh[(ldh*3)+1]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq]); q2 = _SSE_LOAD(&q[ldq+offset]); @@ -1036,6 +1673,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[ldq],q1); _SSE_STORE(&q[ldq+offset],q2); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+1]); h4 = _mm_set1_pd(hh[(ldh*3)+2]); @@ -1044,6 +1682,19 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h3 = _mm_set1_ps(hh[(ldh*2)+1]); h4 = _mm_set1_ps(hh[(ldh*3)+2]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); + h4 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); + h4 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*2]); q2 = _SSE_LOAD(&q[(ldq*2)+offset]); @@ -1053,6 +1704,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[ldq*2],q1); _SSE_STORE(&q[(ldq*2)+offset],q2); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+1]); h3 = _mm_set1_pd(hh[(ldh*2)+2]); @@ -1064,6 +1716,22 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h4 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); + h3 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); + h4 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); + h3 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); + h4 = 
_mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); + +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*3]); q2 = _SSE_LOAD(&q[(ldq*3)+offset]); @@ -1075,6 +1743,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, for (i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); h2 = _mm_set1_pd(hh[ldh+i-2]); @@ -1087,6 +1756,23 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); h4 = _mm_set1_ps(hh[(ldh*3)+i]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[i-3], hh[i-3]); + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#endif + q1 = _SSE_LOAD(&q[i*ldq]); @@ -1101,6 +1787,7 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[(i*ldq)+offset],q2); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-3]); h2 = _mm_set1_pd(hh[ldh+nb-2]); @@ -1111,6 +1798,21 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h2 = _mm_set1_ps(hh[ldh+nb-2]); h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-3], hh[nb-3]); + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); + h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[nb*ldq]); q2 = _SSE_LOAD(&q[(nb*ldq)+offset]); @@ -1121,6 +1823,7 @@ 
__forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[nb*ldq],q1); _SSE_STORE(&q[(nb*ldq)+offset],q2); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); h2 = _mm_set1_pd(hh[ldh+nb-1]); @@ -1129,6 +1832,19 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, h1 = _mm_set1_ps(hh[nb-2]); h2 = _mm_set1_ps(hh[ldh+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); + h2 = _mm_set_pd(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); + h2 = _mm_set_ps(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]); @@ -1139,12 +1855,24 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[(nb+1)*ldq],q1); _SSE_STORE(&q[((nb+1)*ldq)+offset],q2); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-1], hh[nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+2)*ldq]); q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]); @@ -1165,12 +1893,23 @@ __forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, * matrix Vector product with two householder * vectors + a rank 1 update is performed */ +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) #endif #ifdef SINGLE_PRECISION_REAL __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, 
float s_2_3, float s_1_4, float s_2_4, float s_3_4) #endif +#endif +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_2_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) +#endif +#ifdef SINGLE_PRECISION_REAL +__forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) +#endif +#endif + { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [2 x nb+3] * hh @@ -1183,6 +1922,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq]); __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[0]); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]); __SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]); @@ -1199,6 +1939,27 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, __m128 h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]); __m128 h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); + __SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 h_2_1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); // h_2_1 contains four times hh[ldh+1] + __m128 h_3_2 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __m128 h_3_1 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __m128 h_4_3 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __m128 h_4_2 = 
_mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __m128 h_4_1 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#endif + __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3)); w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2)); w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1)); @@ -1216,6 +1977,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, for(i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); h2 = _mm_set1_pd(hh[ldh+i-2]); @@ -1228,6 +1990,23 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); h4 = _mm_set1_ps(hh[(ldh*3)+i]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[i-3], hh[i-3]); + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#endif + q1 = _SSE_LOAD(&q[i*ldq]); x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1)); @@ -1236,6 +2015,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4)); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-3]); h2 = _mm_set1_pd(hh[ldh+nb-2]); @@ -1246,12 +2026,28 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h2 = _mm_set1_ps(hh[ldh+nb-2]); h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-3], hh[nb-3]); + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); + h2 = _mm_set_ps(hh[ldh+nb-2], 
hh[ldh+nb-2]); + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[nb*ldq]); x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1)); y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); h2 = _mm_set1_pd(hh[(ldh*1)+nb-1]); @@ -1260,17 +2056,42 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h1 = _mm_set1_ps(hh[nb-2]); h2 = _mm_set1_ps(hh[(ldh*1)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); + h2 = _mm_set_pd(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); + h2 = _mm_set_ps(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1)); y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2)); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-1], hh[nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+2)*ldq]); x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1)); @@ -1278,6 +2099,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, // Rank-1 update of Q [2 x nb+3] ///////////////////////////////////////////////////// +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]); __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]); @@ -1304,6 +2126,37 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, __m128 vs_2_4 = _mm_set1_ps(s_2_4); __m128 vs_3_4 = _mm_set1_ps(s_3_4); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + __SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]); + 
__SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]); + __SSE_DATATYPE tau3 = _mm_set_pd(hh[ldh*2], hh[ldh*2]); + __SSE_DATATYPE tau4 = _mm_set_pd(hh[ldh*3], hh[ldh*3]); + + __SSE_DATATYPE vs_1_2 = _mm_set_pd(s_1_2, s_1_2); + __SSE_DATATYPE vs_1_3 = _mm_set_pd(s_1_3, s_1_3); + __SSE_DATATYPE vs_2_3 = _mm_set_pd(s_2_3, s_2_3); + __SSE_DATATYPE vs_1_4 = _mm_set_pd(s_1_4, s_1_4); + __SSE_DATATYPE vs_2_4 = _mm_set_pd(s_2_4, s_2_4); + __SSE_DATATYPE vs_3_4 = _mm_set_pd(s_3_4, s_3_4); +#endif +#ifdef SINGLE_PRECISION_REAL + __m128 tau1 = _mm_set_ps(hh[0], hh[0]); + __m128 tau2 = _mm_set_ps(hh[ldh], hh[ldh]); + __m128 tau3 = _mm_set_ps(hh[ldh*2], hh[ldh*2]); + __m128 tau4 = _mm_set_ps(hh[ldh*3], hh[ldh*3]); + + __m128 vs_1_2 = _mm_set_ps(s_1_2, s_1_2); + __m128 vs_1_3 = _mm_set_ps(s_1_3, s_1_3); + __m128 vs_2_3 = _mm_set_ps(s_2_3, s_2_3); + __m128 vs_1_4 = _mm_set_ps(s_1_4, s_1_4); + __m128 vs_2_4 = _mm_set_ps(s_2_4, s_2_4); + __m128 vs_3_4 = _mm_set_ps(s_3_4, s_3_4); +#endif +#endif + h1 = tau1; x1 = _SSE_MUL(x1, h1); @@ -1330,18 +2183,31 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, q1 = _SSE_SUB(q1, w1); _SSE_STORE(&q[0],q1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h4 = _mm_set1_pd(hh[(ldh*3)+1]); #endif #ifdef SINGLE_PRECISION_REAL h4 = _mm_set1_ps(hh[(ldh*3)+1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h4 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h4 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq]); q1 = _SSE_SUB(q1, _SSE_ADD(z1, _SSE_MUL(w1, h4))); _SSE_STORE(&q[ldq],q1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h3 = _mm_set1_pd(hh[(ldh*2)+1]); h4 = _mm_set1_pd(hh[(ldh*3)+2]); @@ -1350,12 +2216,26 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h3 = _mm_set1_ps(hh[(ldh*2)+1]); h4 = _mm_set1_ps(hh[(ldh*3)+2]); #endif +#endif + +#ifdef
HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]); + h4 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#ifdef SINGLE_PRECISION_REAL + h3 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]); + h4 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*2]); q1 = _SSE_SUB(q1, _SSE_ADD(y1, _SSE_ADD(_SSE_MUL(z1, h3), _SSE_MUL(w1, h4)))); _SSE_STORE(&q[ldq*2],q1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h2 = _mm_set1_pd(hh[ldh+1]); h3 = _mm_set1_pd(hh[(ldh*2)+2]); @@ -1366,6 +2246,21 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h3 = _mm_set1_ps(hh[(ldh*2)+2]); h4 = _mm_set1_ps(hh[(ldh*3)+3]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]); + h3 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]); + h4 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#ifdef SINGLE_PRECISION_REAL + h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); + h3 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]); + h4 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#endif + q1 = _SSE_LOAD(&q[ldq*3]); q1 = _SSE_SUB(q1, _SSE_ADD(x1, _SSE_ADD(_SSE_MUL(y1, h2), _SSE_ADD(_SSE_MUL(z1, h3), _SSE_MUL(w1, h4))))); @@ -1374,6 +2269,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, for (i = 4; i < nb; i++) { +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[i-3]); h2 = _mm_set1_pd(hh[ldh+i-2]); @@ -1385,6 +2281,22 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h2 = _mm_set1_ps(hh[ldh+i-2]); h3 = _mm_set1_ps(hh[(ldh*2)+i-1]); h4 = _mm_set1_ps(hh[(ldh*3)+i]); +#endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[i-3], hh[i-3]); + h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif +#ifdef SINGLE_PRECISION_REAL
+ h1 = _mm_set_ps(hh[i-3], hh[i-3]); + h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]); + h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]); + h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]); +#endif #endif q1 = _SSE_LOAD(&q[i*ldq]); @@ -1394,6 +2306,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, _SSE_STORE(&q[i*ldq],q1); } +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-3]); h2 = _mm_set1_pd(hh[ldh+nb-2]); @@ -1404,12 +2317,28 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h2 = _mm_set1_ps(hh[ldh+nb-2]); h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-3], hh[nb-3]); + h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-3], hh[nb-3]); + h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]); + h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[nb*ldq]); q1 = _SSE_SUB(q1, _SSE_ADD(_SSE_ADD(_SSE_MUL(z1, h3), _SSE_MUL(y1, h2)) , _SSE_MUL(x1, h1))); _SSE_STORE(&q[nb*ldq],q1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = _mm_set1_pd(hh[nb-2]); h2 = _mm_set1_pd(hh[ldh+nb-1]); @@ -1418,18 +2347,43 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, h1 = _mm_set1_ps(hh[nb-2]); h2 = _mm_set1_ps(hh[ldh+nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-2], hh[nb-2]); + h2 = _mm_set_pd(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-2], hh[nb-2]); + h2 = _mm_set_ps(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+1)*ldq]); q1 = _SSE_SUB(q1, _SSE_ADD( _SSE_MUL(y1, h2) , _SSE_MUL(x1, h1))); _SSE_STORE(&q[(nb+1)*ldq],q1); +#ifdef HAVE_SSE_INTRINSICS #ifdef DOUBLE_PRECISION_REAL h1 = 
_mm_set1_pd(hh[nb-1]); #endif #ifdef SINGLE_PRECISION_REAL h1 = _mm_set1_ps(hh[nb-1]); #endif +#endif + +#ifdef HAVE_SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL + h1 = _mm_set_pd(hh[nb-1], hh[nb-1]); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = _mm_set_ps(hh[nb-1], hh[nb-1]); +#endif +#endif + q1 = _SSE_LOAD(&q[(nb+2)*ldq]); q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1)); -- GitLab