Commit 46ed16fd authored by Andreas Marek

Double precision real block4 kernel for Sparc64

parent c111ef44
......@@ -51,5 +51,5 @@
#include "../../general/precision_macros.h"
#include "real_sse_6hv_template.c"
#undef REALCASE
#undef SINGLE__PRECISION
#undef SINGLE_PRECISION
......@@ -62,6 +62,15 @@
#include "config-f90.h"
#ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h>
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
......@@ -82,10 +91,6 @@
#define _SSE_STORE _mm_store_ps
#endif
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE_INTRINSICS
......@@ -93,6 +98,7 @@
#endif
//Forward declaration
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
......@@ -103,13 +109,36 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb,
__forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
#endif
#endif
// Forward declarations of the SPARC64 kernels that the driver routine in this
// file dispatches to. The number in each kernel name (2/4/6 for double,
// 4/8/12 for single) equals the count the driver adds to worked_on after the
// corresponding call. All variants share one parameter list: q, hh, nb, ldq,
// ldh and the six scalars s_1_2 .. s_3_4.
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_4_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
#endif
#endif
// Prototypes of the externally visible driver routines. Which of them are
// declared is selected by the instruction-set macro (HAVE_SSE_INTRINSICS /
// HAVE_SPARC64_SSE) and the precision macro (DOUBLE_PRECISION_REAL /
// SINGLE_PRECISION_REAL). Every size argument (pnb, pnq, pldq, pldh) is
// passed by pointer.
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
// SPARC64 counterparts; the "!f>" comment blocks in this file bind these two
// names from Fortran via bind(C, name=...).
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
/*
!f>#ifdef HAVE_SSE_INTRINSICS
......@@ -125,6 +154,20 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_sparc64_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_sparc64_4hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
......@@ -139,11 +182,36 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_sparc64_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_sparc64_4hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
{
......@@ -206,14 +274,25 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-4; i+=6)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_6_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 6;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-8; i+=12)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 12;
}
......@@ -226,7 +305,12 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
#ifdef DOUBLE_PRECISION_REAL
if (nq-i ==4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 4;
}
#endif
......@@ -234,7 +318,12 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
#ifdef SINGLE_PRECISION_REAL
if (nq-i ==8)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 8;
}
#endif
......@@ -242,7 +331,13 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 2;
}
#endif
......@@ -250,14 +345,25 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
#ifdef SINGLE_PRECISION_REAL
if (nq-i ==4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
worked_on += 4;
}
#endif
#ifdef WITH_DEBUG
// Debug-only consistency check: worked_on is incremented after every kernel
// dispatch above, so a value different from nq is reported and the program
// is aborted.
if (worked_on != nq)
{
// The message names the instruction-set variant that was compiled in.
#ifdef HAVE_SSE_INTRINSICS
printf("Error in real SSE BLOCK4 kernel \n");
#endif
#ifdef HAVE_SPARC64_SSE
printf("Error in real SPARC64 BLOCK4 kernel \n");
#endif
abort();
}
#endif
......@@ -275,12 +381,22 @@ void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq,
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [6 x nb+3] * hh
......@@ -293,6 +409,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
__SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq]);
__SSE_DATATYPE a4_1 = _SSE_LOAD(&q[0]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]);
__SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]);
......@@ -310,6 +427,27 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
__m128 h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]);
__m128 h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
__SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
__SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
__SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]);
__SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]);
__SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
// Single-precision SPARC64 setup of the h_* coefficient vectors; each one is
// built from a single element of hh.
// NOTE(review): the trailing comment on h_2_1 says "four times", but every
// call here passes only two arguments. The standard SSE _mm_set_ps takes four
// floats -- confirm against the Fujitsu <emmintrin.h> that this compiles and
// fills every lane as intended.
__m128 h_2_1 = _mm_set_ps(hh[ldh+1], hh[ldh+1]); // h_2_1 contains four times hh[ldh+1]
__m128 h_3_2 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]);
__m128 h_3_1 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]);
__m128 h_4_3 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]);
__m128 h_4_2 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]);
__m128 h_4_1 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
#endif
......@@ -358,11 +496,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
for(i = 4; i < nb; i++)
{
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-3]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set_pd(hh[i-3], hh[i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[i-3], hh[i-3]);
#endif
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
......@@ -372,45 +521,90 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[ldh+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+i-2]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]);
#endif
#endif
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set1_pd(hh[(ldh*2)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3 = _mm_set1_ps(hh[(ldh*2)+i-1]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#endif
z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h4 = _mm_set1_pd(hh[(ldh*3)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h4 = _mm_set1_ps(hh[(ldh*3)+i]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#endif
w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
w3 = _SSE_ADD(w3, _SSE_MUL(q3,h4));
}
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-3]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set_pd(hh[nb-3], hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-3], hh[nb-3]);
#endif
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);
......@@ -419,33 +613,68 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+nb-2]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
#endif
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#endif
z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-2]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set_pd(hh[nb-2], hh[nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-2], hh[nb-2]);
#endif
#endif
q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);
q3 = _SSE_LOAD(&q[((nb+1)*ldq)+2*offset]);
......@@ -454,22 +683,48 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[(ldh*1)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+nb-1]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set_pd(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set_ps(hh[ldh+nb-1], hh[(ldh*1)+nb-1]);
#endif
#endif
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set_pd(hh[nb-1], hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set_ps(hh[nb-1], hh[nb-1]);
#endif
#endif
q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);
q3 = _SSE_LOAD(&q[((nb+2)*ldq)+2*offset]);
......@@ -482,18 +737,31 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
// Rank-1 update of Q [6 x nb+3]
/////////////////////////////////////////////////////
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
#endif
#ifdef SINGLE_PRECISION_REAL
__m128 tau1 = _mm_set1_ps(hh[0]);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]);
#endif
#ifdef SINGLE_PRECISION_REAL
__m128 tau1 = _mm_set_ps(hh[0], hh[0]);
#endif
#endif
// Scale the accumulated x vectors by tau1 (every lane of tau1 is set from
// hh[0]). Each of x1, x2, x3 is multiplied exactly once; the former second,
// unterminated "x3 = _SSE_MUL(x3, h1)" line was a duplicate that broke
// compilation and is removed.
h1 = tau1;
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs_1_2 = _mm_set1_pd(s_1_2);
......@@ -502,6 +770,19 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
__m128 tau2 = _mm_set1_ps(hh[ldh]);
__m128 vs_1_2 = _mm_set1_ps(s_1_2);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]);
__SSE_DATATYPE vs_1_2 = _mm_set_pd(s_1_2, s_1_2);
#endif
#ifdef SINGLE_PRECISION_REAL
__m128 tau2 = _mm_set_ps(hh[ldh], hh[ldh]);
__m128 vs_1_2 = _mm_set_ps(s_1_2, s_1_2);
#endif
#endif
h1 = tau2;
h2 = _SSE_MUL(h1, vs_1_2);
......@@ -510,6 +791,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
y2 = _SSE_SUB(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
y3 = _SSE_SUB(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau3 = _mm_set1_pd(hh[ldh*2]);
__SSE_DATATYPE vs_1_3 = _mm_set1_pd(s_1_3);
......@@ -520,6 +802,21 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
__m128 vs_1_3 = _mm_set1_ps(s_1_3);
__m128 vs_2_3 = _mm_set1_ps(s_2_3);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau3 = _mm_set_pd(hh[ldh*2], hh[ldh*2]);
__SSE_DATATYPE vs_1_3 = _mm_set_pd(s_1_3, s_1_3);
__SSE_DATATYPE vs_2_3 = _mm_set_pd(s_2_3, s_2_3);
#endif
#ifdef SINGLE_PRECISION_REAL
__m128 tau3 = _mm_set_ps(hh[ldh*2], hh[ldh*2]);
__m128 vs_1_3 = _mm_set_ps(s_1_3, s_1_3);
__m128 vs_2_3 = _mm_set_ps(s_2_3, s_2_3);
#endif
#endif
h1 = tau3;
h2 = _SSE_MUL(h1, vs_1_3);
......@@ -529,6 +826,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
z2 = _SSE_SUB(_SSE_MUL(z2,h1), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)));
z3 = _SSE_SUB(_SSE_MUL(z3,h1), _SSE_ADD(_SSE_MUL(y3,h3), _SSE_MUL(x3,h2)));
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau4 = _mm_set1_pd(hh[ldh*3]);
__SSE_DATATYPE vs_1_4 = _mm_set1_pd(s_1_4);
......@@ -540,6 +838,22 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
__m128 vs_1_4 = _mm_set1_ps(s_1_4);
__m128 vs_2_4 = _mm_set1_ps(s_2_4);
__m128 vs_3_4 = _mm_set1_ps(s_3_4);
#endif
#endif
#ifdef HAVE_SPARC64_SSE