Commit 72d5ce75 authored by Andreas Marek

Update VSX kernel files

parent e2398529
@@ -48,12 +48,1879 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET VSX_SSE
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
#undef VEC_SET
#ifdef HAVE_VSX_SSE
#include <altivec.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define __SSE_DATATYPE __vector double
#define _SSE_LOAD vec_ld
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_NEG vec_neg
#define _SSE_STORE vec_st
#define offset 2
#endif
#ifdef SINGLE_PRECISION_REAL
#define __SSE_DATATYPE __vector float
#define _SSE_LOAD vec_ld
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_NEG vec_neg
#define _SSE_STORE vec_st
#define offset 4
#endif
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
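/* The _SSE_* macros above keep the kernel source shared with the x86 SSE
 * variants while expanding to AltiVec/VSX intrinsics from <altivec.h> here.
 * A sketch of the double-precision expansion (q must be 16-byte aligned,
 * as vec_ld/vec_st require):
 *
 *   __SSE_DATATYPE v = _SSE_LOAD(0, q);    // vec_ld(0, q): loads 2 doubles
 *   v = _SSE_ADD(v, _SSE_MUL(v, h1));      // vec_add / vec_mul, elementwise
 *   _SSE_STORE(v, 0, q);                   // vec_st(v, 0, q): stores back
 *
 * "offset" is the element count of one 128-bit vector register:
 * 2 doubles or 4 floats.
 */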
// Forward declarations
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_6_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_10_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_20_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating the scalar product of the two Householder vectors,
// needed to apply them simultaneously
#ifdef DOUBLE_PRECISION_REAL
double s = hh[(ldh)+1]*1.0;
#endif
#ifdef SINGLE_PRECISION_REAL
float s = hh[(ldh)+1]*1.0f;
#endif
#ifdef HAVE_SSE_INTRINSICS
#pragma ivdep
#endif
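// s = v2^T v1 over the overlap of the two (shifted) Householder vectors:
// the term hh[ldh+1]*1.0 above accounts for the implicit leading 1 of v1,
// and the loop adds the remaining products hh[i-1]*hh[ldh+i].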
for (i = 2; i < nb; i++)
{
s += hh[i-1] * hh[(i+ldh)];
}
// Production level kernel calls with padding
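// The main loops below handle 12 (double) / 24 (single) rows of q per
// iteration; the fixed-size kernels that follow pick up any remainder of nq.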
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-10; i+=12)
{
hh_trafo_kernel_12_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24)
{
hh_trafo_kernel_24_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 10)
{
hh_trafo_kernel_10_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 10;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 20)
{
hh_trafo_kernel_20_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 20;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 6)
{
hh_trafo_kernel_6_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 6;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 12)
{
hh_trafo_kernel_12_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2)
{
hh_trafo_kernel_2_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 2;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in real VSX BLOCK2 kernel %d %d\n", worked_on, nq);
abort();
}
#endif
}
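/* For reference: a scalar sketch (hypothetical helper, not part of the ELPA
 * API) of what the vector kernels below compute for a single column of q,
 * assuming the same hh layout as above (hh[0] = tau1, hh[ldh] = tau2,
 * implicit leading 1 in each Householder vector). Kept under #if 0 so it
 * does not enter the build.
 */
#if 0
static void hh_trafo_kernel_1_scalar_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	int i;
	double x = q[ldq];
	double y;
	for (i = 2; i < nb; i++) x += q[i*ldq] * hh[i-1];   // x = v1^T q (rows 1..nb)
	x += q[nb*ldq] * hh[nb-1];
	y = q[0] + q[ldq] * hh[ldh+1];                      // y = v2^T q (rows 0..nb-1)
	for (i = 2; i < nb; i++) y += q[i*ldq] * hh[ldh+i];
	x *= -hh[0];                                        // x <- -tau1 * x
	y = -hh[ldh] * y - hh[ldh] * s * x;                 // y <- -tau2*y - tau2*s*x
	q[0]      += y;                                     // rank-2 update per row
	q[ldq]    += x + y * hh[ldh+1];
	for (i = 2; i < nb; i++) q[i*ldq] += x * hh[i-1] + y * hh[ldh+i];
	q[nb*ldq] += x * hh[nb-1];
}
#endif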
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 12 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 24 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
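/* Sketch of the computation (exact arithmetic): with v1, v2 the two
 * Householder vectors (implicit leading 1), tau1 = hh[0], tau2 = hh[ldh]
 * and s the overlap product precomputed in the driver above, the kernel
 * applies
 *
 *   Q <- (I - tau2*v2*v2^T) * (I - tau1*v1*v1^T) * Q
 *
 * expanded into x = Q^T v1, y = Q^T v2 and the rank-2 update
 *
 *   Q <- Q - tau1*v1*x^T - tau2*v2*y^T + tau1*tau2*s*v2*x^T
 */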
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 (double) / 24 (single) x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
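// Two passes over the block: the first accumulates x = Q^T v1 and
// y = Q^T v2 in registers x1..x6 / y1..y6, the second scales them by
// -tau1 / -tau2 and stores the rank-2 update back into q.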
int i;
#ifdef HAVE_SSE_INTRINSICS
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
#endif
__SSE_DATATYPE x1 = _SSE_LOAD(0, &q[ldq]);
__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);
__SSE_DATATYPE x6 = _SSE_LOAD(0, &q[ldq+5*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
#endif
#endif
__SSE_DATATYPE h2;
__SSE_DATATYPE q1 = _SSE_LOAD(0, q);
__SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
__SSE_DATATYPE q2 = _SSE_LOAD(0, &q[offset]);
__SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
__SSE_DATATYPE q3 = _SSE_LOAD(0, &q[2*offset]);
__SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
__SSE_DATATYPE q4 = _SSE_LOAD(0, &q[3*offset]);
__SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
__SSE_DATATYPE q5 = _SSE_LOAD(0, &q[4*offset]);
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
__SSE_DATATYPE q6 = _SSE_LOAD(0, &q[5*offset]);
__SSE_DATATYPE y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
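// Interior rows 2..nb-1 contribute to both dot products: h1 carries the
// v1 component hh[i-1], h2 the v2 component hh[ldh+i].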
for(i = 2; i < nb; i++)
{
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = vec_splats(hh[i-1]);
h2 = vec_splats(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = vec_splats(hh[i-1]);
h2 = vec_splats(hh[ldh+i]);
#endif
#endif
q1 = _SSE_LOAD(0, &q[i*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2));
}
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = vec_splats(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = vec_splats(hh[nb-1]);
#endif
#endif
q1 = _SSE_LOAD(0,&q[nb*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
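// x <- -tau1*x and y <- -tau2*y - tau2*s*x, so each row update below
// reduces to q_row += x*v1[row] + y*v2[row].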
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_pd(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
__SSE_DATATYPE vs = _mm_set1_ps(s);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = vec_splats(hh[0]);
__SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
__SSE_DATATYPE vs = vec_splats(s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = vec_splats(hh[0]);
__SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
__SSE_DATATYPE vs = vec_splats(s);
#endif
#endif
#ifdef HAVE_SSE_INTRINSICS
h1 = _SSE_XOR(tau1, sign);
#endif
#ifdef HAVE_VSX_SSE
h1 = vec_neg(tau1);
#endif
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1);
x4 = _SSE_MUL(x4, h1);
x5 = _SSE_MUL(x5, h1);
x6 = _SSE_MUL(x6, h1);
#ifdef HAVE_SSE_INTRINSICS
h1 = _SSE_XOR(tau2, sign);
#endif
#ifdef HAVE_VSX_SSE
h1 = vec_neg(tau2);
#endif
h2 = _SSE_MUL(h1, vs);
y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
y6 = _SSE_ADD(_SSE_MUL(y6,h1), _SSE_MUL(x6,h2));
q1 = _SSE_LOAD(0, q);
q1 = _SSE_ADD(q1, y1);
_SSE_STORE(q1, 0, q);
q2 = _SSE_LOAD(0,&q[offset]);
q2 = _SSE_ADD(q2, y2);
_SSE_STORE(q2, 0,&q[offset]);
q3 = _SSE_LOAD(0, &q[2*offset]);
q3 = _SSE_ADD(q3, y3);
_SSE_STORE(q3, 0, &q[2*offset]);
q4 = _SSE_LOAD(0, &q[3*offset]);
q4 = _SSE_ADD(q4, y4);
_SSE_STORE(q4, 0, &q[3*offset]);
q5 = _SSE_LOAD(0, &q[4*offset]);
q5 = _SSE_ADD(q5, y5);
_SSE_STORE(q5, 0, &q[4*offset]);
q6 = _SSE_LOAD(0, &q[5*offset]);
q6 = _SSE_ADD(q6, y6);
_SSE_STORE(q6, 0, &q[5*offset]);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_set1_ps(hh[ldh+1]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
h2 = vec_splats(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = vec_splats(hh[ldh+1]);
#endif
#endif
q1 = _SSE_LOAD(0, &q[ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
_SSE_STORE(q1, 0, &q[ldq]);
q2 = _SSE_LOAD(0, &q[ldq+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
_SSE_STORE(q2, 0, &q[ldq+offset]);
q3 = _SSE_LOAD(0, &q[ldq+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
_SSE_STORE(q3, 0, &q[ldq+2*offset]);
q4 = _SSE_LOAD(0, &q[ldq+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
_SSE_STORE(q4, 0, &q[ldq+3*offset]);
q5 = _SSE_LOAD(0, &q[ldq+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
_SSE_STORE(q5, 0, &q[ldq+4*offset]);
q6 = _SSE_LOAD(0, &q[ldq+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(x6, _SSE_MUL(y6, h2)));
_SSE_STORE(q6, 0, &q[ldq+5*offset]);
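// Interior rows 2..nb-1 of the update: q_row += x*hh[i-1] + y*hh[ldh+i].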
for (i = 2; i < nb; i++)
{
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[i-1]);
h2 = _mm_set1_pd(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[i-1]);
h2 = _mm_set1_ps(hh[ldh+i]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = vec_splats(hh[i-1]);
h2 = vec_splats(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = vec_splats(hh[i-1]);
h2 = vec_splats(hh[ldh+i]);
#endif
#endif
q1 = _SSE_LOAD(0, &q[i*ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
_SSE_STORE(q1, 0, &q[i*ldq]);
q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
_SSE_STORE(q2, 0, &q[(i*ldq)+offset]);
q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
_SSE_STORE(q3, 0, &q[(i*ldq)+2*offset]);
q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
_SSE_STORE(q4, 0, &q[(i*ldq)+3*offset]);
q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
_SSE_STORE(q5, 0, &q[(i*ldq)+4*offset]);
q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(_SSE_MUL(x6,h1), _SSE_MUL(y6, h2)));
_SSE_STORE(q6, 0, &q[(i*ldq)+5*offset]);
}
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_set1_ps(hh[nb-1]);
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
h1 = vec_splats(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = vec_splats(hh[nb-1]);
#endif
#endif
q1 = _SSE_LOAD(0, &q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
_SSE_STORE(q1, 0, &q[nb*ldq]);
q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
_SSE_STORE(q2, 0, &q[(nb*ldq)+offset]);
q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
_SSE_STORE(q3, 0, &q[(nb*ldq)+2*offset]);
q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
_SSE_STORE(q4, 0, &q[(nb*ldq)+3*offset]);
q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
_SSE_STORE(q5, 0, &q[(nb*ldq)+4*offset]);
q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
_SSE_STORE(q6, 0, &q[(nb*ldq)+5*offset]);
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 10 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 20 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
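// Structurally identical to the 12/24-row kernel above, but using five
// vector registers (x1..x5 / y1..y5) per pass instead of six.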
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_10_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_20_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [10 (double) / 20 (single) x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
#ifdef HAVE_SSE_INTRINSICS
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
#endif
__SSE_DATATYPE x1 = _SSE_LOAD(0, &q[ldq]);
__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);