Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
207febce
Commit
207febce
authored
Sep 13, 2016
by
Andreas Marek
Browse files
Comment unused code paths in AVX kernels
parent
f581320b
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
View file @
207febce
...
...
@@ -272,145 +272,145 @@ void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq,
//#endif
}
#if 0
/*
 * hexa_hh_trafo_fast_ - driver around hh_trafo_kernel_8_AVX_6hv().
 *
 * Builds the 15 pairwise scalar products that the original comments call
 * s_a_b (1 <= a < b <= 6) from entries of hh, then invokes the kernel on q
 * at offsets 0, 8, 16, ... below nq.
 *
 * Parameters (all scalars are passed by pointer):
 *   q    - forwarded to the kernel as q + offset.
 *   hh   - read at hh[k + ldh*c] for c = 0..5; presumably six Householder
 *          vectors stored with leading dimension ldh (TODO confirm).
 *   pnb  - *pnb is the upper bound of the accumulation index.
 *   pnq  - never dereferenced; the trip count is taken from *pldq instead.
 *   pldq - *pldq is used both as ldq and as the kernel loop bound nq.
 *   pldh - *pldh is the stride between the six sections of hh.
 *
 * Slot layout: s_a_b lives in scalarprods[(b-1)*(b-2)/2 + (a-1)], i.e.
 * s_1_2, s_1_3, s_2_3, s_1_4, ..., s_5_6.
 */
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	const int nb = *pnb;
	const int ldq = *pldq;
	const int nq = *pldq;
	const int ldh = *pldh;
	double scalarprods[15];
	int lag, lo, k, off;

	/* Walk the pairs grouped by lag = b - a. */
	for (lag = 1; lag <= 5; lag++)
	{
		for (lo = 1; lo + lag <= 6; lo++)
		{
			const int hi = lo + lag;
			const int slot = ((hi - 1) * (hi - 2)) / 2 + (lo - 1);
			const int lo_base = ldh * (lo - 1);
			const int hi_base = ldh * (hi - 1);

			/* Starting value taken directly from hh. */
			scalarprods[slot] = hh[hi_base + lag];

			/*
			 * Indices below 6 are always visited, whatever nb is;
			 * beyond that the range is bounded by nb.
			 */
			#pragma ivdep
			for (k = lag + 1; k < 6 || k < nb; k++)
			{
				scalarprods[slot] += hh[lo_base + k - lag] * hh[k + hi_base];
			}
		}
	}

	/* Kernel calls in strides of 8 over the ldq-sized range. */
	for (off = 0; off < nq; off += 8)
	{
		hh_trafo_kernel_8_AVX_6hv(q + off, hh, nb, ldq, ldh, scalarprods);
	}
}
#endif
/**
* Unrolled kernel that computes
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
View file @
207febce
...
...
@@ -163,65 +163,67 @@ void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int*
}
}
#if 0
/*
 * quad_hh_trafo_fast_ - driver around the 4-vector kernels.
 *
 * Computes the six pairwise scalar products s_a_b (1 <= a < b <= 4) from
 * entries of hh and hands them, by value, to hh_trafo_kernel_12_AVX_4hv()
 * (when __AVX__ is defined) or hh_trafo_kernel_6_SSE_4hv() otherwise.
 *
 * Parameters (all scalars are passed by pointer):
 *   q    - forwarded to the kernel as q + offset.
 *   hh   - read at hh[k + ldh*c] for c = 0..3; presumably four Householder
 *          vectors stored with leading dimension ldh (TODO confirm).
 *   pnb  - *pnb is the upper bound of the accumulation index.
 *   pnq  - never dereferenced; the trip count is taken from *pldq instead.
 *   pldq - *pldq is used both as ldq and as the kernel loop bound nq.
 *   pldh - *pldh is the stride between the four sections of hh.
 */
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	const int nb = *pnb;
	const int ldq = *pldq;
	const int nq = *pldq;
	const int ldh = *pldh;

	/* Offsets of the second, third and fourth section of hh. */
	const int c2 = ldh;
	const int c3 = ldh * 2;
	const int c4 = ldh * 3;

	/* Starting values taken directly from hh. */
	double s_1_2 = hh[c2 + 1];
	double s_1_3 = hh[c3 + 2];
	double s_2_3 = hh[c3 + 1];
	double s_1_4 = hh[c4 + 3];
	double s_2_4 = hh[c4 + 2];
	double s_3_4 = hh[c4 + 1];
	int k, off;

	/*
	 * k = 2 and k = 3 are always visited, whatever nb is; from k = 4 on
	 * the range is bounded by nb. A product with lag d only starts
	 * contributing at k = d + 1.
	 */
	#pragma ivdep
	for (k = 2; k < 4 || k < nb; k++)
	{
		/* lag 1 */
		s_1_2 += hh[k - 1] * hh[k + c2];
		s_2_3 += hh[c2 + k - 1] * hh[k + c3];
		s_3_4 += hh[c3 + k - 1] * hh[k + c4];

		/* lag 2 */
		if (k >= 3)
		{
			s_1_3 += hh[k - 2] * hh[k + c3];
			s_2_4 += hh[c2 + k - 2] * hh[k + c4];
		}

		/* lag 3 */
		if (k >= 4)
		{
			s_1_4 += hh[k - 3] * hh[k + c4];
		}
	}

	/* Kernel calls over the ldq-sized range; stride depends on the ISA. */
#ifdef __AVX__
	for (off = 0; off < nq; off += 12)
	{
		hh_trafo_kernel_12_AVX_4hv(q + off, hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
	}
#else
	for (off = 0; off < nq; off += 6)
	{
		hh_trafo_kernel_6_SSE_4hv(q + off, hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
	}
#endif
}
#endif
/**
* Unrolled kernel that computes
* 6 rows of Q simultaneously, a
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
View file @
207febce
...
...
@@ -243,160 +243,160 @@ void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int*
}
}
#if 0
/*
 * hexa_hh_trafo_fast_ - driver around the 6-vector kernels.
 *
 * Builds the 15 pairwise scalar products that the original comments call
 * s_a_b (1 <= a < b <= 6) from entries of hh, then invokes
 * hh_trafo_kernel_8_AVX_6hv() (when __AVX__ is defined) or
 * hh_trafo_kernel_4_SSE_6hv() otherwise on successive offsets of q.
 *
 * Parameters (all scalars are passed by pointer):
 *   q    - forwarded to the kernel as q + offset.
 *   hh   - read at hh[k + ldh*c] for c = 0..5; presumably six Householder
 *          vectors stored with leading dimension ldh (TODO confirm).
 *   pnb  - *pnb is the upper bound of the accumulation index.
 *   pnq  - never dereferenced; the trip count is taken from *pldq instead.
 *   pldq - *pldq is used both as ldq and as the kernel loop bound nq.
 *   pldh - *pldh is the stride between the six sections of hh.
 *
 * Slot layout: s_a_b lives in scalarprods[(b-1)*(b-2)/2 + (a-1)], i.e.
 * s_1_2, s_1_3, s_2_3, s_1_4, ..., s_5_6.
 */
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	const int nb = *pnb;
	const int ldq = *pldq;
	const int nq = *pldq;
	const int ldh = *pldh;
	double scalarprods[15];
	int first, second, k, off;
	int slot = 0;

	/*
	 * Visit the pairs in slot order: (1,2), (1,3), (2,3), (1,4), ...
	 * so a running counter reproduces the slot layout.
	 */
	for (second = 2; second <= 6; second++)
	{
		const int second_base = ldh * (second - 1);

		for (first = 1; first < second; first++, slot++)
		{
			const int lag = second - first;
			const int first_base = ldh * (first - 1);

			/* Starting value taken directly from hh. */
			scalarprods[slot] = hh[second_base + lag];

			/*
			 * Indices below 6 are always visited, whatever nb is;
			 * beyond that the range is bounded by nb.
			 */
			#pragma ivdep
			for (k = lag + 1; k < 6 || k < nb; k++)
			{
				scalarprods[slot] += hh[first_base + k - lag] * hh[k + second_base];
			}
		}
	}

	/* Kernel calls over the ldq-sized range; stride depends on the ISA. */
#ifdef __AVX__
	for (off = 0; off < nq; off += 8)
	{
		hh_trafo_kernel_8_AVX_6hv(q + off, hh, nb, ldq, ldh, scalarprods);
	}
#else
	for (off = 0; off < nq; off += 4)
	{
		hh_trafo_kernel_4_SSE_6hv(q + off, hh, nb, ldq, ldh, scalarprods);
	}
#endif
}
#endif
//
#if 0
//
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
//
{
//
int i;
//
int nb = *pnb;
//
int nq = *pldq;
//
int ldq = *pldq;
//
int ldh = *pldh;
//
//
// calculating scalar products to compute
//
// 6 householder vectors simultaneously
//
double scalarprods[15];
//
//
// scalarprods[0] = s_1_2;
//
// scalarprods[1] = s_1_3;
//
// scalarprods[2] = s_2_3;
//
// scalarprods[3] = s_1_4;
//
// scalarprods[4] = s_2_4;
//
// scalarprods[5] = s_3_4;
//
// scalarprods[6] = s_1_5;
//
// scalarprods[7] = s_2_5;
//
// scalarprods[8] = s_3_5;
//
// scalarprods[9] = s_4_5;
//
// scalarprods[10] = s_1_6;
//
// scalarprods[11] = s_2_6;
//
// scalarprods[12] = s_3_6;
//
// scalarprods[13] = s_4_6;
//
// scalarprods[14] = s_5_6;
//
//
scalarprods[0] = hh[(ldh+1)];
//
scalarprods[1] = hh[(ldh*2)+2];
//
scalarprods[2] = hh[(ldh*2)+1];
//
scalarprods[3] = hh[(ldh*3)+3];
//
scalarprods[4] = hh[(ldh*3)+2];
//
scalarprods[5] = hh[(ldh*3)+1];
//
scalarprods[6] = hh[(ldh*4)+4];
//
scalarprods[7] = hh[(ldh*4)+3];
//
scalarprods[8] = hh[(ldh*4)+2];
//
scalarprods[9] = hh[(ldh*4)+1];
//
scalarprods[10] = hh[(ldh*5)+5];
//
scalarprods[11] = hh[(ldh*5)+4];
//
scalarprods[12] = hh[(ldh*5)+3];
//
scalarprods[13] = hh[(ldh*5)+2];
//
scalarprods[14] = hh[(ldh*5)+1];
//
//
// calculate scalar product of first and fourth householder vector
//
// loop counter = 2
//
scalarprods[0] += hh[1] * hh[(2+ldh)];
//
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
//
scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
//
scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
//
scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
//
//
// loop counter = 3
//
scalarprods[0] += hh[2] * hh[(3+ldh)];
//
scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
//
scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
//
scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
//
scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
//