elpa / elpa · Commit f4495fb0
Authored Aug 02, 2017 by Andreas Marek
Check error in AVX/AVX2 kernels
Parent: 72700e47
Changes: 5 · Pipelines: 1
src/elpa2/kernels/complex_avx-avx2_1hv_template.Xc
...
...
@@ -64,6 +64,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex
}
#endif
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//abort();
}
}
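Note: the check added above compares the number of rows the kernels actually processed (worked_on, incremented by the dispatcher after each kernel call) against nq, but both the message and abort() are left commented out, so a mismatch is still silent in this commit. A minimal, self-contained sketch of the pattern with the check enabled (hypothetical kernel name, not the ELPA template code):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical 8-row kernel standing in for the AVX/AVX2 template kernels. */
    static void fake_kernel_8(float _Complex *q) { (void)q; }

    static void trafo_dispatch(float _Complex *q, int nq)
    {
            int i, worked_on = 0;

            for (i = 0; i + 8 <= nq; i += 8) {   /* full 8-row blocks */
                    fake_kernel_8(&q[i]);
                    worked_on += 8;
            }
            if (worked_on != nq) {
                    /* These two lines are the ones still commented out in the commit. */
                    printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
                    abort();
            }
    }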
...
...
src/elpa2/kernels/complex_avx-avx2_2hv_template.Xc
...
...
@@ -64,6 +64,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
}
#endif
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//abort();
}
}
...
...
src/elpa2/kernels/real_avx-avx2_2hv_template.Xc
...
...
@@ -63,6 +63,8 @@
#include "config-f90.h"
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
...
...
@@ -123,16 +125,20 @@
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -185,6 +191,9 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar product to compute
// 2 householder vectors simultaneously
...
...
@@ -201,75 +210,121 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24)
{
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
for (i = 0; i < nq-40; i+=48)
{
hh_trafo_kernel_48_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 20)
{
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_kernel_20_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 20;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+16], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 16)
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 40)
{
hh_trafo_kernel_40_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 40;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 12)
if (nq-i == 32)
{
hh_trafo_kernel_32_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 32;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+8], hh, nb, ldq, ldh, s);
if (nq-i == 12)
{
hh_trafo_kernel_12_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+8], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 8)
if (nq-i == 24)
{
hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
}
else
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
if (worked_on != nq)
{
//printf("Error in real avx/avx2 BLOCK 2 kernel \n");
//abort();
}
}
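The rewritten dispatcher above now runs full 24-row (double) or 48-row (single) blocks in the loop and then at most one remainder kernel (20/16/12/8/4 rows for double precision, 40/32/24/16/8 for single), accumulating worked_on as it goes. A compressed sketch of the double-precision control flow, with a hypothetical stand-in kernel and the per-block bookkeeping written out explicitly (not a line-for-line copy of the template):

    /* Stand-in for the fixed-width ELPA kernels; `rows` is the block width. */
    static void fake_kernel(double *q, int rows) { (void)q; (void)rows; }

    static int dispatch_2hv_double(double *q, int nq)
    {
            static const int remainders[] = { 20, 16, 12, 8, 4 };
            int i, k, worked_on = 0;

            for (i = 0; i < nq - 20; i += 24) {        /* full 24-row blocks */
                    fake_kernel(&q[i], 24);
                    worked_on += 24;
            }
            for (k = 0; k < 5; k++) {                  /* at most one remainder case fires */
                    if (nq - i == remainders[k]) {
                            fake_kernel(&q[i], remainders[k]);
                            worked_on += remainders[k];
                    }
            }
            return worked_on == nq;                    /* the consistency check this commit adds */
    }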
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 24 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 48 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
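The comment above describes what every kernel in this file computes: two Householder reflectors applied to a block of rows of Q as a single rank-2 update. Schematically (my notation, not from the source), with Householder vectors v_1, v_2, scalars \tau_1, \tau_2 and the precomputed scalar product s passed in by the dispatcher:

    (I - \tau_2 v_2 v_2^T)(I - \tau_1 v_1 v_1^T)
        = I - \tau_1 v_1 v_1^T - \tau_2 v_2 v_2^T + \tau_1 \tau_2 s \, v_2 v_1^T,
      where s = v_2^T v_1.

Because the combined transform has rank 2, each unrolled kernel only needs two sets of accumulator registers per row block (the x* and y* variables below) before Q is written back.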
...
...
@@ -277,7 +332,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
...
...
@@ -296,11 +351,9 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
__AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
__AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
__AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
__AVX_DATATYPE x6 = _AVX_LOAD(&q[ldq+5*offset]);
#endif
__AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
__AVX_DATATYPE h2;
...
...
@@ -312,14 +365,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
__AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
__AVX_DATATYPE y6 = _AVX_FMA(x6, h1, q6);
#endif
#else
__AVX_DATATYPE q1 = _AVX_LOAD(q);
...
...
@@ -328,14 +379,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
__AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
__AVX_DATATYPE y6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
#endif
#endif
...
...
@@ -353,7 +402,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
y3 = _AVX_FMA(q3, h2, y3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
y4 = _AVX_FMA(q4, h2, y4);
...
...
@@ -363,7 +411,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
x6 = _AVX_FMA(q6, h1, x6);
y6 = _AVX_FMA(q6, h2, y6);
#endif
#else
q1 = _AVX_LOAD(&q[i*ldq]);
...
...
@@ -375,7 +422,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
...
...
@@ -385,7 +431,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
y6 = _AVX_ADD(y6, _AVX_MUL(q6,h2));
#endif
#endif
}
...
...
@@ -398,14 +443,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x2 = _AVX_FMA(q2, h1, x2);
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
x6 = _AVX_FMA(q6, h1, x6);
#endif
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
...
...
@@ -414,14 +457,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
#endif
#endif
...
...
@@ -437,32 +478,27 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x1 = _AVX_MUL(x1, h1);
x2 = _AVX_MUL(x2, h1);
x3 = _AVX_MUL(x3, h1);
#ifdef DOUBLE_PRECISION_REAL
x4 = _AVX_MUL(x4, h1);
x5 = _AVX_MUL(x5, h1);
x6 = _AVX_MUL(x6, h1);
#endif
h1 = _AVX_XOR(tau2, sign);
h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
#ifdef DOUBLE_PRECISION_REAL
y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
y6 = _AVX_FMA(y6, h1, _AVX_MUL(x6,h2));
#endif
#else
y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
#ifdef DOUBLE_PRECISION_REAL
y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
y6 = _AVX_ADD(_AVX_MUL(y6,h1), _AVX_MUL(x6,h2));
#endif
#endif
...
...
@@ -475,7 +511,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[2*offset]);
q3 = _AVX_ADD(q3, y3);
_AVX_STORE(&q[2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[3*offset]);
q4 = _AVX_ADD(q4, y4);
_AVX_STORE(&q[3*offset],q4);
...
...
@@ -485,7 +520,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[5*offset]);
q6 = _AVX_ADD(q6, y6);
_AVX_STORE(&q[5*offset],q6);
#endif
h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_LOAD(&q[ldq]);
...
...
@@ -497,7 +532,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[ldq+2*offset]);
q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
_AVX_STORE(&q[ldq+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[ldq+3*offset]);
q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
_AVX_STORE(&q[ldq+3*offset],q4);
...
...
@@ -507,7 +541,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[ldq+5*offset]);
q6 = _AVX_ADD(q6, _AVX_FMA(y6, h2, x6));
_AVX_STORE(&q[ldq+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[ldq]);
...
...
@@ -519,7 +552,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[ldq+2*offset]);
q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
_AVX_STORE(&q[ldq+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[ldq+3*offset]);
q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
_AVX_STORE(&q[ldq+3*offset],q4);
...
...
@@ -529,7 +561,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[ldq+5*offset]);
q6 = _AVX_ADD(q6, _AVX_ADD(x6, _AVX_MUL(y6, h2)));
_AVX_STORE(&q[ldq+5*offset],q6);
#endif
#endif
...
...
@@ -550,7 +581,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_FMA(x3, h1, q3);
q3 = _AVX_FMA(y3, h2, q3);
_AVX_STORE(&q[(i*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
q4 = _AVX_FMA(x4, h1, q4);
q4 = _AVX_FMA(y4, h2, q4);
...
...
@@ -563,7 +593,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_FMA(x6, h1, q6);
q6 = _AVX_FMA(y6, h2, q6);
_AVX_STORE(&q[(i*ldq)+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[i*ldq]);
...
...
@@ -575,7 +604,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
_AVX_STORE(&q[(i*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
_AVX_STORE(&q[(i*ldq)+3*offset],q4);
...
...
@@ -585,7 +613,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
q6 = _AVX_ADD(q6, _AVX_ADD(_AVX_MUL(x6,h1), _AVX_MUL(y6, h2)));
_AVX_STORE(&q[(i*ldq)+5*offset],q6);
#endif
#endif
}
...
...
@@ -601,7 +628,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX_FMA(x3, h1, q3);
_AVX_STORE(&q[(nb*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX_FMA(x4, h1, q4);
_AVX_STORE(&q[(nb*ldq)+3*offset],q4);
...
...
@@ -611,7 +637,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
q6 = _AVX_FMA(x6, h1, q6);
_AVX_STORE(&q[(nb*ldq)+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
...
...
@@ -623,7 +648,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
_AVX_STORE(&q[(nb*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
_AVX_STORE(&q[(nb*ldq)+3*offset],q4);
...
...
@@ -633,26 +657,31 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
q6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
_AVX_STORE(&q[(nb*ldq)+5*offset],q6);
#endif
#endif
}
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
#ifdef DOUBLE_PRECISION_REAL
* 20 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 40 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
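The row counts in this comment follow from the AVX register width. Assuming __AVX_DATATYPE maps to a 256-bit register (4 doubles or 8 floats), a kernel that keeps five accumulator registers per quantity covers 20 double-precision or 40 single-precision rows, while the six-register kernel above covers 24 / 48. A small sketch of that arithmetic (assumed register width, not taken from the template):

    /* 256-bit AVX register: 32 bytes. */
    enum {
            DOUBLE_LANES     = 32 / 8,             /* 4 doubles per register */
            SINGLE_LANES     = 32 / 4,             /* 8 floats per register  */
            ROWS_5REG_DOUBLE = 5 * DOUBLE_LANES,   /* 20, as in ..._20_AVX_2hv_double */
            ROWS_5REG_SINGLE = 5 * SINGLE_LANES    /* 40, as in ..._40_AVX_2hv_single */
    };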
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [16 x nb+1] * hh
// Matrix Vector Multiplication, Q [24 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
...
...
@@ -666,10 +695,10 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
__AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
__AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
#endif
__AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
__AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
__AVX_DATATYPE h2;
...
...
@@ -678,24 +707,24 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
__AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
__AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
#endif
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
#else
__AVX_DATATYPE q1 = _AVX_LOAD(q);
__AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
__AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
__AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
#endif
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
#endif
...
...
@@ -710,14 +739,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX_FMA(q2, h1, x2);
y2 = _AVX_FMA(q2, h2, y2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
y3 = _AVX_FMA(q3, h2, y3);
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
y4 = _AVX_FMA(q4, h2, y4);
#endif
q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
y5 = _AVX_FMA(q5, h2, y5);
#else
q1 = _AVX_LOAD(&q[i*ldq]);
...
...
@@ -726,14 +756,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
#endif
q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));
#endif
}
...
...
@@ -744,29 +775,29 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x1 = _AVX_FMA(q1, h1, x1);
q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX_FMA(q2, h1, x2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
#endif
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
#endif
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
#endif
/////////////////////////////////////////////////////
// Rank-2 update of Q [16 x nb+1]
// Rank-2 update of Q [24 x nb+1]
/////////////////////////////////////////////////////
__AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
...
...
@@ -776,27 +807,25 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
h1 = _AVX_XOR(tau1, sign);
x1 = _AVX_MUL(x1, h1);
x2 = _AVX_MUL(x2, h1);
#ifdef DOUBLE_PRECISION_REAL
x3 = _AVX_MUL(x3, h1);
x4 = _AVX_MUL(x4, h1);
#endif
x5 = _AVX_MUL(x5, h1);
h1 = _AVX_XOR(tau2, sign);
h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
#ifdef DOUBLE_PRECISION_REAL
y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
#endif
y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
#else
y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
#ifdef DOUBLE_PRECISION_REAL
y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
#endif
y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
#endif
...
...
@@ -806,14 +835,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[offset]);
q2 = _AVX_ADD(q2, y2);
_AVX_STORE(&q[offset],q2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[2*offset]);
q3 = _AVX_ADD(q3, y3);
_AVX_STORE(&q[2*offset],q3);
q4 = _AVX_LOAD(&q[3*offset]);