Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
207febce
Commit
207febce
authored
Sep 13, 2016
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Comment unused code paths in AVX kernels
parent
f581320b
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
349 additions
and
347 deletions
+349
-347
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
+136
-136
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
+59
-57
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
+154
-154
No files found.
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
View file @
207febce
...
...
@@ -272,145 +272,145 @@ void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq,
//#endif
}
#if 0
/**
 * Driver for the kernel that applies 6 Householder vectors at once.
 *
 * Step 1: build the 15 pairwise scalar products of the six Householder
 *         vectors held in hh (vector k starts at hh[k*ldh]).
 * Step 2: pass q to hh_trafo_kernel_8_AVX_6hv in slices of 8 entries.
 *
 * Layout of scalarprods: the pair (a, b), a < b, 1-based, is stored at
 * index (b-1)*(b-2)/2 + (a-1), i.e. in the order
 *   s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4, s_1_5, ..., s_5_6.
 *
 * Note: pnq is accepted but never read; the slice loop is bounded by *pldq.
 */
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
    const int nb = *pnb;
    const int ldq = *pldq;
    const int ldh = *pldh;
    const int nq = ldq;
    /* Rows 2..5 are accumulated unconditionally; rows from 6 on only while < nb. */
    const int row_end = (nb > 6) ? nb : 6;
    double scalarprods[15];
    int first;
    int second;
    int row;
    int col;

    for (second = 1; second < 6; second++)
    {
        const int later = second * ldh;

        for (first = 0; first < second; first++)
        {
            const int earlier = first * ldh;
            /* Distance between the two vectors; the earlier one is read
             * 'shift' rows above the later one. */
            const int shift = second - first;
            double acc = hh[later + shift];

            for (row = shift + 1; row < row_end; row++)
            {
                acc += hh[earlier + row - shift] * hh[later + row];
            }

            scalarprods[(second * (second - 1)) / 2 + first] = acc;
        }
    }

    /* Production level kernel calls with padding */
    for (col = 0; col < nq; col += 8)
    {
        hh_trafo_kernel_8_AVX_6hv(&q[col], hh, nb, ldq, ldh, scalarprods);
    }
}
#endif
/**
* Unrolled kernel that computes
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
View file @
207febce
...
...
@@ -163,65 +163,67 @@ void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int*
}
}
#if 0
/**
 * Driver for the kernel that applies 4 Householder vectors at once.
 *
 * Step 1: build the 6 pairwise scalar products of the four Householder
 *         vectors held in hh (vector k starts at hh[k*ldh]).
 * Step 2: pass q to the compile-time selected kernel, in slices of 12
 *         entries when __AVX__ is defined and 6 entries otherwise.
 *
 * Note: pnq is accepted but never read; the slice loop is bounded by *pldq.
 */
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
    const int nb = *pnb;
    const int ldq = *pldq;
    const int ldh = *pldh;
    const int nq = ldq;
    /* Rows 2..3 are accumulated unconditionally; rows from 4 on only while < nb. */
    const int row_end = (nb > 4) ? nb : 4;
    /* Pair (a, b), a < b, 1-based, is stored at (b-1)*(b-2)/2 + (a-1). */
    double prods[6];
    int first;
    int second;
    int row;
    int col;

    for (second = 1; second < 4; second++)
    {
        const int later = second * ldh;

        for (first = 0; first < second; first++)
        {
            const int earlier = first * ldh;
            /* Distance between the two vectors; the earlier one is read
             * 'shift' rows above the later one. */
            const int shift = second - first;
            double acc = hh[later + shift];

            for (row = shift + 1; row < row_end; row++)
            {
                acc += hh[earlier + row - shift] * hh[later + row];
            }

            prods[(second * (second - 1)) / 2 + first] = acc;
        }
    }

    {
        const double s_1_2 = prods[0];
        const double s_1_3 = prods[1];
        const double s_2_3 = prods[2];
        const double s_1_4 = prods[3];
        const double s_2_4 = prods[4];
        const double s_3_4 = prods[5];

        /* Production level kernel calls with padding */
#ifdef __AVX__
        for (col = 0; col < nq; col += 12)
        {
            hh_trafo_kernel_12_AVX_4hv(&q[col], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
        }
#else
        for (col = 0; col < nq; col += 6)
        {
            hh_trafo_kernel_6_SSE_4hv(&q[col], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
        }
#endif
    }
}
#endif
/**
* Unrolled kernel that computes
* 6 rows of Q simultaneously, a
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
View file @
207febce
...
...
@@ -243,160 +243,160 @@ void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int*
}
}
#if 0
/**
 * Driver for the kernel that applies 6 Householder vectors at once.
 *
 * Step 1: build the 15 pairwise scalar products of the six Householder
 *         vectors held in hh (vector k starts at hh[k*ldh]).
 * Step 2: pass q to the compile-time selected kernel, in slices of 8
 *         entries when __AVX__ is defined and 4 entries otherwise.
 *
 * Layout of scalarprods: the pair (a, b), a < b, 1-based, is stored at
 * index (b-1)*(b-2)/2 + (a-1), i.e. in the order
 *   s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4, s_1_5, ..., s_5_6.
 *
 * Note: pnq is accepted but never read; the slice loop is bounded by *pldq.
 */
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
    const int nb = *pnb;
    const int ldq = *pldq;
    const int ldh = *pldh;
    const int nq = ldq;
    /* Rows 2..5 are accumulated unconditionally; rows from 6 on only while < nb. */
    const int row_end = (nb > 6) ? nb : 6;
    double scalarprods[15];
    int first;
    int second;
    int row;
    int col;

    for (second = 1; second < 6; second++)
    {
        const int later = second * ldh;

        for (first = 0; first < second; first++)
        {
            const int earlier = first * ldh;
            /* Distance between the two vectors; the earlier one is read
             * 'shift' rows above the later one. */
            const int shift = second - first;
            double acc = hh[later + shift];

            for (row = shift + 1; row < row_end; row++)
            {
                acc += hh[earlier + row - shift] * hh[later + row];
            }

            scalarprods[(second * (second - 1)) / 2 + first] = acc;
        }
    }

    /* Production level kernel calls with padding */
#ifdef __AVX__
    for (col = 0; col < nq; col += 8)
    {
        hh_trafo_kernel_8_AVX_6hv(&q[col], hh, nb, ldq, ldh, scalarprods);
    }
#else
    for (col = 0; col < nq; col += 4)
    {
        hh_trafo_kernel_4_SSE_6hv(&q[col], hh, nb, ldq, ldh, scalarprods);
    }
#endif
}
#endif
//
#if 0
//
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
//
{
//
int i;
//
int nb = *pnb;
//
int nq = *pldq;
//
int ldq = *pldq;
//
int ldh = *pldh;
//
//
// calculating scalar products to compute
//
// 6 householder vectors simultaneously
//
double scalarprods[15];
//
//
//
scalarprods[0] = s_1_2;
//
//
scalarprods[1] = s_1_3;
//
//
scalarprods[2] = s_2_3;
//
//
scalarprods[3] = s_1_4;
//
//
scalarprods[4] = s_2_4;
//
//
scalarprods[5] = s_3_4;
//
//
scalarprods[6] = s_1_5;
//
//
scalarprods[7] = s_2_5;
//
//
scalarprods[8] = s_3_5;
//
//
scalarprods[9] = s_4_5;
//
//
scalarprods[10] = s_1_6;
//
//
scalarprods[11] = s_2_6;
//
//
scalarprods[12] = s_3_6;
//
//
scalarprods[13] = s_4_6;
//
//
scalarprods[14] = s_5_6;
//
//
scalarprods[0] = hh[(ldh+1)];
//
scalarprods[1] = hh[(ldh*2)+2];
//
scalarprods[2] = hh[(ldh*2)+1];
//
scalarprods[3] = hh[(ldh*3)+3];
//
scalarprods[4] = hh[(ldh*3)+2];
//
scalarprods[5] = hh[(ldh*3)+1];
//
scalarprods[6] = hh[(ldh*4)+4];
//
scalarprods[7] = hh[(ldh*4)+3];
//
scalarprods[8] = hh[(ldh*4)+2];
//
scalarprods[9] = hh[(ldh*4)+1];
//
scalarprods[10] = hh[(ldh*5)+5];
//
scalarprods[11] = hh[(ldh*5)+4];
//
scalarprods[12] = hh[(ldh*5)+3];
//
scalarprods[13] = hh[(ldh*5)+2];
//
scalarprods[14] = hh[(ldh*5)+1];
//
//
// calculate scalar product of first and fourth householder vector
//
// loop counter = 2
//
scalarprods[0] += hh[1] * hh[(2+ldh)];
//
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
//
scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
//
scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
//
scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
//
//
// loop counter = 3
//
scalarprods[0] += hh[2] * hh[(3+ldh)];
//
scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
//
scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
//
scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
//
scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
//
//
scalarprods[1] += hh[1] * hh[3+(ldh*2)];
//
scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
//
scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
//
scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
//
//
// loop counter = 4
//
scalarprods[0] += hh[3] * hh[(4+ldh)];
//
scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
//
scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
//
scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
//
scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
//
//
scalarprods[1] += hh[2] * hh[4+(ldh*2)];
//
scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
//
scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
//
scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
//
//
scalarprods[3] += hh[1] * hh[4+(ldh*3)];
//
scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
//
scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
//
//
// loop counter = 5
//
scalarprods[0] += hh[4] * hh[(5+ldh)];
//
scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];