Commit 35e63340 authored by Andreas Marek

Smaller step size in real SSE BLOCK2 kernel

parent f4495fb0
......@@ -46,6 +46,8 @@
//
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
......@@ -75,14 +77,20 @@
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_6_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_10_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
......@@ -133,6 +141,9 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on = 0;
// calculating the scalar product needed to apply
// 2 Householder vectors simultaneously
......@@ -149,54 +160,984 @@ void double_hh_trafo_real_sse_2hv_single(float* q, float* hh, int* pnb, int* pnq
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-10; i+=12)
{
hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24)
{
hh_trafo_kernel_24_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 10)
{
hh_trafo_kernel_10_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 10;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 20)
{
hh_trafo_kernel_20_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 20;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 6)
{
hh_trafo_kernel_6_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 6;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 12)
{
hh_trafo_kernel_12_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2)
{
hh_trafo_kernel_2_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 2;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
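// sanity check: every row of q should have been handled by exactly one of the
// kernel calls above; the error handling below is currently disabled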
if (worked_on != nq)
{
// printf("Error in real SSE BLOCK2 kernel \n");
// abort();
}
}
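#if 0
/* Illustrative sketch only (not part of this commit): the padding scheme used
 * by the driver above, written as plain scalar C. process_rows() is a purely
 * hypothetical helper standing in for the width-specific SSE kernels, and the
 * single-precision step size of 24 is used for concreteness.
 */
static void dispatch_sketch(float* q, float* hh, int nb, int nq, int ldq, int ldh, float s)
{
    int i;
    int worked_on = 0;
    // full blocks of 24 rows
    for (i = 0; i < nq-20; i+=24)
    {
        process_rows(&q[i], hh, nb, ldq, ldh, s, 24);   // hypothetical helper
        worked_on += 24;
    }
    // at most one narrower remainder block of 20, 16, 12, 8 or 4 rows
    if (nq - i > 0)
    {
        process_rows(&q[i], hh, nb, ldq, ldh, s, nq - i);
        worked_on += nq - i;
    }
    // worked_on == nq must hold once every row has been handled
}
#endif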
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 12 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 24 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
__SSE_DATATYPE x1 = _SSE_LOAD(&q[ldq]);
__SSE_DATATYPE x2 = _SSE_LOAD(&q[ldq+offset]);
__SSE_DATATYPE x3 = _SSE_LOAD(&q[ldq+2*offset]);
__SSE_DATATYPE x4 = _SSE_LOAD(&q[ldq+3*offset]);
__SSE_DATATYPE x5 = _SSE_LOAD(&q[ldq+4*offset]);
__SSE_DATATYPE x6 = _SSE_LOAD(&q[ldq+5*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
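// single precision: broadcast hh[ldh+1] to all four lanes; loaddup_pd loads
// the float pair {hh[ldh+1], hh[ldh+2]} into both 64-bit halves and
// moveldup_ps then duplicates the even (low) float of each pair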
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
#endif
__SSE_DATATYPE h2;
__SSE_DATATYPE q1 = _SSE_LOAD(q);
__SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
__SSE_DATATYPE q2 = _SSE_LOAD(&q[offset]);
__SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
__SSE_DATATYPE q3 = _SSE_LOAD(&q[2*offset]);
__SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
__SSE_DATATYPE q4 = _SSE_LOAD(&q[3*offset]);
__SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
__SSE_DATATYPE q5 = _SSE_LOAD(&q[4*offset]);
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
__SSE_DATATYPE q6 = _SSE_LOAD(&q[5*offset]);
__SSE_DATATYPE y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
q4 = _SSE_LOAD(&q[(i*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
q5 = _SSE_LOAD(&q[(i*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
q6 = _SSE_LOAD(&q[(i*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2));
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
q4 = _SSE_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
q5 = _SSE_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
q6 = _SSE_LOAD(&q[(nb*ldq)+5*offset]);
x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [12 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
#endif
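// XOR with the sign mask flips the sign bit, i.e. h1 = -tau1 here
// (and h1 = -tau2 a few lines further down)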
h1 = _SSE_XOR(tau1, sign);
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1);
x4 = _SSE_MUL(x4, h1);
x5 = _SSE_MUL(x5, h1);
x6 = _SSE_MUL(x6, h1);
h1 = _SSE_XOR(tau2, sign);
h2 = _SSE_MUL(h1, vs);
y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
y6 = _SSE_ADD(_SSE_MUL(y6,h1), _SSE_MUL(x6,h2));
q1 = _SSE_LOAD(q);
q1 = _SSE_ADD(q1, y1);
_SSE_STORE(q,q1);
q2 = _SSE_LOAD(&q[offset]);
q2 = _SSE_ADD(q2, y2);
_SSE_STORE(&q[offset],q2);
q3 = _SSE_LOAD(&q[2*offset]);
q3 = _SSE_ADD(q3, y3);
_SSE_STORE(&q[2*offset],q3);
q4 = _SSE_LOAD(&q[3*offset]);
q4 = _SSE_ADD(q4, y4);
_SSE_STORE(&q[3*offset],q4);
q5 = _SSE_LOAD(&q[4*offset]);
q5 = _SSE_ADD(q5, y5);
_SSE_STORE(&q[4*offset],q5);
q6 = _SSE_LOAD(&q[5*offset]);
q6 = _SSE_ADD(q6, y6);
_SSE_STORE(&q[5*offset],q6);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
#endif
q1 = _SSE_LOAD(&q[ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
_SSE_STORE(&q[ldq],q1);
q2 = _SSE_LOAD(&q[ldq+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
_SSE_STORE(&q[ldq+offset],q2);
q3 = _SSE_LOAD(&q[ldq+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
_SSE_STORE(&q[ldq+2*offset],q3);
q4 = _SSE_LOAD(&q[ldq+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
_SSE_STORE(&q[ldq+3*offset],q4);
q5 = _SSE_LOAD(&q[ldq+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
_SSE_STORE(&q[ldq+4*offset],q5);
q6 = _SSE_LOAD(&q[ldq+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(x6, _SSE_MUL(y6, h2)));
_SSE_STORE(&q[ldq+5*offset],q6);
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[i-1]));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+i]));
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
_SSE_STORE(&q[i*ldq],q1);
q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
_SSE_STORE(&q[(i*ldq)+offset],q2);
q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
_SSE_STORE(&q[(i*ldq)+2*offset],q3);
q4 = _SSE_LOAD(&q[(i*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
_SSE_STORE(&q[(i*ldq)+3*offset],q4);
q5 = _SSE_LOAD(&q[(i*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
_SSE_STORE(&q[(i*ldq)+4*offset],q5);
q6 = _SSE_LOAD(&q[(i*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_ADD(_SSE_MUL(x6,h1), _SSE_MUL(y6, h2)));
_SSE_STORE(&q[(i*ldq)+5*offset],q6);
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[nb-1]));
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
_SSE_STORE(&q[nb*ldq],q1);
q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
_SSE_STORE(&q[(nb*ldq)+offset],q2);
q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
_SSE_STORE(&q[(nb*ldq)+2*offset],q3);
q4 = _SSE_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
_SSE_STORE(&q[(nb*ldq)+3*offset],q4);
q5 = _SSE_LOAD(&q[(nb*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
_SSE_STORE(&q[(nb*ldq)+4*offset],q5);
q6 = _SSE_LOAD(&q[(nb*ldq)+5*offset]);
q6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
_SSE_STORE(&q[(nb*ldq)+5*offset],q6);
}
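#if 0
/* Illustrative scalar reference (not part of this commit): what the SIMD
 * kernels above compute for a single row of q, i.e. one SIMD lane, assuming
 * double precision and the usual layout tau1 = hh[0], tau2 = hh[ldh] with the
 * leading 1 of each Householder vector implicit. Sketch only, for readability.
 */
static void hh_trafo_2hv_scalar_sketch(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
    int i;
    // x = q^T v1, y = q^T v2 (the leading elements of v1 and v2 are 1)
    double x = q[ldq];
    double y = q[0] + x*hh[ldh+1];
    for (i = 2; i < nb; i++)
    {
        x += q[i*ldq]*hh[i-1];
        y += q[i*ldq]*hh[ldh+i];
    }
    x += q[nb*ldq]*hh[nb-1];
    // coefficients of the rank-2 update
    x = -hh[0]*x;               // -tau1 * x
    y = -hh[ldh]*(y + s*x);     // -tau2 * (y + s*x)
    // apply the update along the row
    q[0]      += y;
    q[ldq]    += x + y*hh[ldh+1];
    for (i = 2; i < nb; i++)
    {
        q[i*ldq] += x*hh[i-1] + y*hh[ldh+i];
    }
    q[nb*ldq] += x*hh[nb-1];
}
#endif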
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 10 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 20 rows of Q simultaneously, a
#endif
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_10_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_20_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [10 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
__SSE_DATATYPE x1 = _SSE_LOAD(&q[ldq]);
__SSE_DATATYPE x2 = _SSE_LOAD(&q[ldq+offset]);
__SSE_DATATYPE x3 = _SSE_LOAD(&q[ldq+2*offset]);
__SSE_DATATYPE x4 = _SSE_LOAD(&q[ldq+3*offset]);
__SSE_DATATYPE x5 = _SSE_LOAD(&q[ldq+4*offset]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_loaddup_pd(&hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
#endif
__SSE_DATATYPE h2;
__SSE_DATATYPE q1 = _SSE_LOAD(q);
__SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
__SSE_DATATYPE q2 = _SSE_LOAD(&q[offset]);
__SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
__SSE_DATATYPE q3 = _SSE_LOAD(&q[2*offset]);
__SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
__SSE_DATATYPE q4 = _SSE_LOAD(&q[3*offset]);
__SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
__SSE_DATATYPE q5 = _SSE_LOAD(&q[4*offset]);
__SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
for(i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
q4 = _SSE_LOAD(&q[(i*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
q5 = _SSE_LOAD(&q[(i*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
q4 = _SSE_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
q5 = _SSE_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [10 x nb+1]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_loaddup_pd(hh);
__SSE_DATATYPE tau2 = _mm_loaddup_pd(&hh[ldh]);
__SSE_DATATYPE vs = _mm_loaddup_pd(&s);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE tau1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) hh)));
__SSE_DATATYPE tau2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh])));
__SSE_DATATYPE vs = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double*) &s)));
#endif
h1 = _SSE_XOR(tau1, sign);
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1);
x4 = _SSE_MUL(x4, h1);
x5 = _SSE_MUL(x5, h1);
h1 = _SSE_XOR(tau2, sign);
h2 = _SSE_MUL(h1, vs);
y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
q1 = _SSE_LOAD(q);
q1 = _SSE_ADD(q1, y1);
_SSE_STORE(q,q1);
q2 = _SSE_LOAD(&q[offset]);
q2 = _SSE_ADD(q2, y2);
_SSE_STORE(&q[offset],q2);
q3 = _SSE_LOAD(&q[2*offset]);
q3 = _SSE_ADD(q3, y3);
_SSE_STORE(&q[2*offset],q3);
q4 = _SSE_LOAD(&q[3*offset]);
q4 = _SSE_ADD(q4, y4);
_SSE_STORE(&q[3*offset],q4);
q5 = _SSE_LOAD(&q[4*offset]);
q5 = _SSE_ADD(q5, y5);
_SSE_STORE(&q[4*offset],q5);
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_loaddup_pd(&hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+1]));
#endif
q1 = _SSE_LOAD(&q[ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
_SSE_STORE(&q[ldq],q1);
q2 = _SSE_LOAD(&q[ldq+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
_SSE_STORE(&q[ldq+offset],q2);
q3 = _SSE_LOAD(&q[ldq+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
_SSE_STORE(&q[ldq+2*offset],q3);
q4 = _SSE_LOAD(&q[ldq+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
_SSE_STORE(&q[ldq+3*offset],q4);
q5 = _SSE_LOAD(&q[ldq+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
_SSE_STORE(&q[ldq+4*offset],q5);
for (i = 2; i < nb; i++)
{
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[i-1]);
h2 = _mm_loaddup_pd(&hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[i-1]));
h2 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i])));
// h2 = _mm_castpd_ps(_mm_loaddup_pd(&hh[ldh+i]));
#endif
q1 = _SSE_LOAD(&q[i*ldq]);
q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
_SSE_STORE(&q[i*ldq],q1);
q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
_SSE_STORE(&q[(i*ldq)+offset],q2);
q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
_SSE_STORE(&q[(i*ldq)+2*offset],q3);
q4 = _SSE_LOAD(&q[(i*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
_SSE_STORE(&q[(i*ldq)+3*offset],q4);
q5 = _SSE_LOAD(&q[(i*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
_SSE_STORE(&q[(i*ldq)+4*offset],q5);
}
#ifdef DOUBLE_PRECISION_REAL
h1 = _mm_loaddup_pd(&hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1])));
// h1 = _mm_castpd_ps(_mm_loaddup_pd(&hh[nb-1]));
#endif
q1 = _SSE_LOAD(&q[nb*ldq]);
q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
_SSE_STORE(&q[nb*ldq],q1);
q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
_SSE_STORE(&q[(nb*ldq)+offset],q2);
q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
_SSE_STORE(&q[(nb*ldq)+2*offset],q3);
q4 = _SSE_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
_SSE_STORE(&q[(nb*ldq)+3*offset],q4);
q5 = _SSE_LOAD(&q[(nb*ldq)+4*offset]);
q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
_SSE_STORE(&q[(nb*ldq)+4*offset],q5);
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 16 rows of Q simultaneously, a
#endif