Commit f4495fb0 authored by Andreas Marek

Check error in AVX/AVX2 kernels

parent 72700e47
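
The change adds a bookkeeping check to the kernel drivers: each block-kernel call is supposed to account for the rows of q it processed in worked_on, which is compared against nq once the dispatch is done. A minimal sketch of that pattern follows; apply_block_2hv() and drive_2hv_sketch() are illustrative stand-ins, not ELPA routines, and only two remainder widths are spelled out.

/* Sketch of the worked_on accounting this commit introduces;
 * the helpers below are placeholders for the real hh_trafo_kernel_* calls. */
static void apply_block_2hv(float *q, int rows) { (void)q; (void)rows; }

void drive_2hv_sketch(float *q, int nq)
{
   int i;
   int worked_on = 0;

   for (i = 0; i < nq-40; i += 48)              /* full 48-row blocks */
   {
      apply_block_2hv(&q[i], 48);
      worked_on += 48;
   }
   if (nq-i == 40) { apply_block_2hv(&q[i], 40); worked_on += 40; }
   if (nq-i == 32) { apply_block_2hv(&q[i], 32); worked_on += 32; }
   /* ... further remainder widths down to 8 ... */

   if (worked_on != nq)
   {
      /* kept disabled in this commit: */
      /* printf("Error in real avx/avx2 BLOCK 2 kernel \n"); */
      /* abort(); */
   }
}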
@@ -64,6 +64,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex
}
#endif
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//abort();
}
}
@@ -64,6 +64,7 @@
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
}
#endif
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//abort();
}
}
@@ -63,6 +63,8 @@
#include "config-f90.h"
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
@@ -123,16 +125,20 @@
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_32_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif
#ifdef DOUBLE_PRECISION_REAL
@@ -185,6 +191,9 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar product to compute
// 2 householder vectors simultaneously
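The "scalar product" referred to here is the overlap s between the two Householder vectors stored in hh; it is what later allows both reflections to be applied in a single sweep over q. The surrounding template computes it roughly along these lines (a hedged reconstruction, not lines shown in this hunk):

   s = hh[ldh+1];
   for (i = 2; i < nb; i++)
   {
      s += hh[i-1] * hh[ldh+i];
   }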
@@ -201,75 +210,121 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-20; i+=24)
{
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_24_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
for (i = 0; i < nq-40; i+=48)
{
hh_trafo_kernel_48_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 20)
{
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_kernel_20_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 20;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+16], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 16)
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 40)
{
hh_trafo_kernel_40_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 40;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 12)
if (nq-i == 32)
{
hh_trafo_kernel_32_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 32;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_double(&q[i+8], hh, nb, ldq, ldh, s);
if (nq-i == 12)
{
hh_trafo_kernel_12_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+8], hh, nb, ldq, ldh, s);
#endif
}
else if (nq-i == 8)
if (nq-i == 24)
{
hh_trafo_kernel_24_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
#endif
}
else
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL
hh_trafo_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
}
#endif
if (worked_on != nq)
{
//printf("Error in real avx/avx2 BLOCK 2 kernel \n");
//abort();
}
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 24 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 48 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
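
To keep the intrinsics below readable, here is a hedged scalar paraphrase of what one lane (one column of the row block) goes through: projection onto both Householder vectors, scaling by the tau factors and the coupling s, then the rank-2 update of q. The function name is hypothetical and the arithmetic is read off the AVX hunks below; it is not a drop-in replacement for the vectorized kernels.

/* Hypothetical scalar paraphrase of a single lane of the 2hv kernel. */
static void hh_trafo_scalar_lane_sketch(double* q, const double* hh,
                                        int nb, int ldq, int ldh, double s)
{
   int i;
   /* x, y: projections of this column of q onto the two Householder vectors */
   double x = q[ldq];
   double y = q[0] + x * hh[ldh+1];
   for (i = 2; i < nb; i++)
   {
      x += q[i*ldq] * hh[i-1];
      y += q[i*ldq] * hh[ldh+i];
   }
   x += q[nb*ldq] * hh[nb-1];

   /* scale by the Householder scalars tau1, tau2 and their coupling s */
   double tau1 = hh[0];
   double tau2 = hh[ldh];
   x = -tau1 * x;
   y = -tau2 * (y + s * x);

   /* rank-2 update of q */
   q[0]      += y;
   q[ldq]    += x + y * hh[ldh+1];
   for (i = 2; i < nb; i++)
   {
      q[i*ldq] += x * hh[i-1] + y * hh[ldh+i];
   }
   q[nb*ldq] += x * hh[nb-1];
}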
@@ -277,7 +332,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
__forceinline void hh_trafo_kernel_48_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
@@ -296,11 +351,9 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
__AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
__AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
__AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
__AVX_DATATYPE x6 = _AVX_LOAD(&q[ldq+5*offset]);
#endif
__AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
__AVX_DATATYPE h2;
@@ -312,14 +365,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
__AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
__AVX_DATATYPE y6 = _AVX_FMA(x6, h1, q6);
#endif
#else
__AVX_DATATYPE q1 = _AVX_LOAD(q);
@@ -328,14 +379,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
__AVX_DATATYPE q6 = _AVX_LOAD(&q[5*offset]);
__AVX_DATATYPE y6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
#endif
#endif
@@ -353,7 +402,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
y3 = _AVX_FMA(q3, h2, y3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
y4 = _AVX_FMA(q4, h2, y4);
@@ -363,7 +411,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
x6 = _AVX_FMA(q6, h1, x6);
y6 = _AVX_FMA(q6, h2, y6);
#endif
#else
q1 = _AVX_LOAD(&q[i*ldq]);
@@ -375,7 +422,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
@@ -385,7 +431,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
y6 = _AVX_ADD(y6, _AVX_MUL(q6,h2));
#endif
#endif
}
@@ -398,14 +443,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x2 = _AVX_FMA(q2, h1, x2);
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
x6 = _AVX_FMA(q6, h1, x6);
#endif
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
@@ -414,14 +457,12 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
x6 = _AVX_ADD(x6, _AVX_MUL(q6,h1));
#endif
#endif
@@ -437,32 +478,27 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x1 = _AVX_MUL(x1, h1);
x2 = _AVX_MUL(x2, h1);
x3 = _AVX_MUL(x3, h1);
#ifdef DOUBLE_PRECISION_REAL
x4 = _AVX_MUL(x4, h1);
x5 = _AVX_MUL(x5, h1);
x6 = _AVX_MUL(x6, h1);
#endif
h1 = _AVX_XOR(tau2, sign);
h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
#ifdef DOUBLE_PRECISION_REAL
y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
y6 = _AVX_FMA(y6, h1, _AVX_MUL(x6,h2));
#endif
#else
y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
#ifdef DOUBLE_PRECISION_REAL
y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
y6 = _AVX_ADD(_AVX_MUL(y6,h1), _AVX_MUL(x6,h2));
#endif
#endif
@@ -475,7 +511,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[2*offset]);
q3 = _AVX_ADD(q3, y3);
_AVX_STORE(&q[2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[3*offset]);
q4 = _AVX_ADD(q4, y4);
_AVX_STORE(&q[3*offset],q4);
@@ -485,7 +520,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[5*offset]);
q6 = _AVX_ADD(q6, y6);
_AVX_STORE(&q[5*offset],q6);
#endif
h2 = _AVX_BROADCAST(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
q1 = _AVX_LOAD(&q[ldq]);
@@ -497,7 +532,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[ldq+2*offset]);
q3 = _AVX_ADD(q3, _AVX_FMA(y3, h2, x3));
_AVX_STORE(&q[ldq+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[ldq+3*offset]);
q4 = _AVX_ADD(q4, _AVX_FMA(y4, h2, x4));
_AVX_STORE(&q[ldq+3*offset],q4);
@@ -507,7 +541,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[ldq+5*offset]);
q6 = _AVX_ADD(q6, _AVX_FMA(y6, h2, x6));
_AVX_STORE(&q[ldq+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[ldq]);
@@ -519,7 +552,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[ldq+2*offset]);
q3 = _AVX_ADD(q3, _AVX_ADD(x3, _AVX_MUL(y3, h2)));
_AVX_STORE(&q[ldq+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[ldq+3*offset]);
q4 = _AVX_ADD(q4, _AVX_ADD(x4, _AVX_MUL(y4, h2)));
_AVX_STORE(&q[ldq+3*offset],q4);
@@ -529,7 +561,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[ldq+5*offset]);
q6 = _AVX_ADD(q6, _AVX_ADD(x6, _AVX_MUL(y6, h2)));
_AVX_STORE(&q[ldq+5*offset],q6);
#endif
#endif
@@ -550,7 +581,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_FMA(x3, h1, q3);
q3 = _AVX_FMA(y3, h2, q3);
_AVX_STORE(&q[(i*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
q4 = _AVX_FMA(x4, h1, q4);
q4 = _AVX_FMA(y4, h2, q4);
@@ -563,7 +593,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_FMA(x6, h1, q6);
q6 = _AVX_FMA(y6, h2, q6);
_AVX_STORE(&q[(i*ldq)+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[i*ldq]);
@@ -575,7 +604,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
q3 = _AVX_ADD(q3, _AVX_ADD(_AVX_MUL(x3,h1), _AVX_MUL(y3, h2)));
_AVX_STORE(&q[(i*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
q4 = _AVX_ADD(q4, _AVX_ADD(_AVX_MUL(x4,h1), _AVX_MUL(y4, h2)));
_AVX_STORE(&q[(i*ldq)+3*offset],q4);
@@ -585,7 +613,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(i*ldq)+5*offset]);
q6 = _AVX_ADD(q6, _AVX_ADD(_AVX_MUL(x6,h1), _AVX_MUL(y6, h2)));
_AVX_STORE(&q[(i*ldq)+5*offset],q6);
#endif
#endif
}
@@ -601,7 +628,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX_FMA(x3, h1, q3);
_AVX_STORE(&q[(nb*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX_FMA(x4, h1, q4);
_AVX_STORE(&q[(nb*ldq)+3*offset],q4);
@@ -611,7 +637,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
q6 = _AVX_FMA(x6, h1, q6);
_AVX_STORE(&q[(nb*ldq)+5*offset],q6);
#endif
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
@@ -623,7 +648,6 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
_AVX_STORE(&q[(nb*ldq)+2*offset],q3);
#ifdef DOUBLE_PRECISION_REAL
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
_AVX_STORE(&q[(nb*ldq)+3*offset],q4);
@@ -633,26 +657,31 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q6 = _AVX_LOAD(&q[(nb*ldq)+5*offset]);
q6 = _AVX_ADD(q6, _AVX_MUL(x6, h1));
_AVX_STORE(&q[(nb*ldq)+5*offset],q6);
#endif
#endif
}
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
#ifdef DOUBLE_PRECISION_REAL
* 20 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 40 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
__forceinline void hh_trafo_kernel_20_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
__forceinline void hh_trafo_kernel_40_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [16 x nb+1] * hh
// Matrix Vector Multiplication, Q [24 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
@@ -666,10 +695,10 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE x1 = _AVX_LOAD(&q[ldq]);
__AVX_DATATYPE x2 = _AVX_LOAD(&q[ldq+offset]);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE x3 = _AVX_LOAD(&q[ldq+2*offset]);
__AVX_DATATYPE x4 = _AVX_LOAD(&q[ldq+3*offset]);
#endif
__AVX_DATATYPE x5 = _AVX_LOAD(&q[ldq+4*offset]);
__AVX_DATATYPE h1 = _AVX_BROADCAST(&hh[ldh+1]);
__AVX_DATATYPE h2;
@@ -678,24 +707,24 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
__AVX_DATATYPE y1 = _AVX_FMA(x1, h1, q1);
__AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
__AVX_DATATYPE y2 = _AVX_FMA(x2, h1, q2);
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_FMA(x3, h1, q3);
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_FMA(x4, h1, q4);
#endif
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_FMA(x5, h1, q5);
#else
__AVX_DATATYPE q1 = _AVX_LOAD(q);
__AVX_DATATYPE y1 = _AVX_ADD(q1, _AVX_MUL(x1, h1));
__AVX_DATATYPE q2 = _AVX_LOAD(&q[offset]);
__AVX_DATATYPE y2 = _AVX_ADD(q2, _AVX_MUL(x2, h1));
#ifdef DOUBLE_PRECISION_REAL
__AVX_DATATYPE q3 = _AVX_LOAD(&q[2*offset]);
__AVX_DATATYPE y3 = _AVX_ADD(q3, _AVX_MUL(x3, h1));
__AVX_DATATYPE q4 = _AVX_LOAD(&q[3*offset]);
__AVX_DATATYPE y4 = _AVX_ADD(q4, _AVX_MUL(x4, h1));
#endif
__AVX_DATATYPE q5 = _AVX_LOAD(&q[4*offset]);
__AVX_DATATYPE y5 = _AVX_ADD(q5, _AVX_MUL(x5, h1));
#endif
@@ -710,14 +739,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX_FMA(q2, h1, x2);
y2 = _AVX_FMA(q2, h2, y2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
y3 = _AVX_FMA(q3, h2, y3);
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
y4 = _AVX_FMA(q4, h2, y4);
#endif
q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
y5 = _AVX_FMA(q5, h2, y5);
#else
q1 = _AVX_LOAD(&q[i*ldq]);
@@ -726,14 +756,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[(i*ldq)+offset]);
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
y2 = _AVX_ADD(y2, _AVX_MUL(q2,h2));
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(i*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
y3 = _AVX_ADD(y3, _AVX_MUL(q3,h2));
q4 = _AVX_LOAD(&q[(i*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
y4 = _AVX_ADD(y4, _AVX_MUL(q4,h2));
#endif
q5 = _AVX_LOAD(&q[(i*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
y5 = _AVX_ADD(y5, _AVX_MUL(q5,h2));
#endif
}
@@ -744,29 +775,29 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
x1 = _AVX_FMA(q1, h1, x1);
q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX_FMA(q2, h1, x2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_FMA(q3, h1, x3);
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_FMA(q4, h1, x4);
#endif
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_FMA(q5, h1, x5);
#else
q1 = _AVX_LOAD(&q[nb*ldq]);
x1 = _AVX_ADD(x1, _AVX_MUL(q1,h1));
q2 = _AVX_LOAD(&q[(nb*ldq)+offset]);
x2 = _AVX_ADD(x2, _AVX_MUL(q2,h1));
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[(nb*ldq)+2*offset]);
x3 = _AVX_ADD(x3, _AVX_MUL(q3,h1));
q4 = _AVX_LOAD(&q[(nb*ldq)+3*offset]);
x4 = _AVX_ADD(x4, _AVX_MUL(q4,h1));
#endif
q5 = _AVX_LOAD(&q[(nb*ldq)+4*offset]);
x5 = _AVX_ADD(x5, _AVX_MUL(q5,h1));
#endif
/////////////////////////////////////////////////////
// Rank-2 update of Q [16 x nb+1]
// Rank-2 update of Q [24 x nb+1]
/////////////////////////////////////////////////////
__AVX_DATATYPE tau1 = _AVX_BROADCAST(hh);
@@ -776,27 +807,25 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
h1 = _AVX_XOR(tau1, sign);
x1 = _AVX_MUL(x1, h1);
x2 = _AVX_MUL(x2, h1);
#ifdef DOUBLE_PRECISION_REAL
x3 = _AVX_MUL(x3, h1);
x4 = _AVX_MUL(x4, h1);
#endif
x5 = _AVX_MUL(x5, h1);
h1 = _AVX_XOR(tau2, sign);
h2 = _AVX_MUL(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _AVX_FMA(y1, h1, _AVX_MUL(x1,h2));
y2 = _AVX_FMA(y2, h1, _AVX_MUL(x2,h2));
#ifdef DOUBLE_PRECISION_REAL
y3 = _AVX_FMA(y3, h1, _AVX_MUL(x3,h2));
y4 = _AVX_FMA(y4, h1, _AVX_MUL(x4,h2));
#endif
y5 = _AVX_FMA(y5, h1, _AVX_MUL(x5,h2));
#else
y1 = _AVX_ADD(_AVX_MUL(y1,h1), _AVX_MUL(x1,h2));
y2 = _AVX_ADD(_AVX_MUL(y2,h1), _AVX_MUL(x2,h2));
#ifdef DOUBLE_PRECISION_REAL
y3 = _AVX_ADD(_AVX_MUL(y3,h1), _AVX_MUL(x3,h2));
y4 = _AVX_ADD(_AVX_MUL(y4,h1), _AVX_MUL(x4,h2));
#endif
y5 = _AVX_ADD(_AVX_MUL(y5,h1), _AVX_MUL(x5,h2));
#endif
@@ -806,14 +835,15 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
q2 = _AVX_LOAD(&q[offset]);
q2 = _AVX_ADD(q2, y2);
_AVX_STORE(&q[offset],q2);
#ifdef DOUBLE_PRECISION_REAL
q3 = _AVX_LOAD(&q[2*offset]);
q3 = _AVX_ADD(q3, y3);