Commit ee4e6cab authored by Andreas Marek
Browse files

Check in AVX-512 kernels

parent 4ccc4ad5
...@@ -51,6 +51,7 @@ ...@@ -51,6 +51,7 @@
#include <complex.h> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -269,7 +270,8 @@ void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* ...@@ -269,7 +270,8 @@ void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex*
#endif #endif
if (worked_on != nq) if (worked_on != nq)
{ {
printf("Error in complex AVX512 BLOCK 1 kernel \n"); // printf("Error in complex AVX512 BLOCK 1 kernel \n");
// abort();
} }
} }
......
...@@ -49,6 +49,7 @@ ...@@ -49,6 +49,7 @@
#include <complex.h> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -193,25 +194,29 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* ...@@ -193,25 +194,29 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex*
return; return;
} }
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 12 ) { if (nq-i == 12 )
{
hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_12_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 12; worked_on += 12;
} }
#endif #endif
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 24 ) { if (nq-i == 24 )
{
hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_24_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24; worked_on += 24;
} }
#endif #endif
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 8 ) { if (nq-i == 8 )
{
hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8; worked_on += 8;
} }
#endif #endif
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 16 ) { if (nq-i == 16 )
{
hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_complex_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16; worked_on += 16;
} }
...@@ -235,7 +240,8 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex* ...@@ -235,7 +240,8 @@ void double_hh_trafo_complex_avx512_2hv_single(float complex* q, float complex*
if (worked_on != nq) if (worked_on != nq)
{ {
printf("Error in complex AVX512 BLOCK 2 kernel \n"); // printf("Error in complex AVX512 BLOCK 2 kernel \n");
// abort();
} }
} }
......
...@@ -48,6 +48,8 @@ ...@@ -48,6 +48,8 @@
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -151,6 +153,9 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -151,6 +153,9 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar product to compute // calculating scalar product to compute
// 2 householder vectors simultaneously // 2 householder vectors simultaneously
...@@ -171,12 +176,14 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -171,12 +176,14 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
for (i = 0; i < nq-24; i+=32) for (i = 0; i < nq-24; i+=32)
{ {
hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-48; i+=64) for (i = 0; i < nq-48; i+=64)
{ {
hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_64_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += i;
} }
#endif #endif
if (nq == i) if (nq == i)
...@@ -184,40 +191,52 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -184,40 +191,52 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
return; return;
} }
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 24) if (nq-i == 24)
{ {
hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 24;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 48) if (nq-i == 48)
{ {
hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_48_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 48;
} }
#endif #endif
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
else if (nq-i == 16) if (nq-i == 16)
{ {
hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
else if (nq-i == 32) if (nq-i == 32)
{ {
hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_32_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 32;
} }
#endif #endif
else
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{ {
hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
worked_on += 8;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{ {
hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_16_AVX512_2hv_single(&q[i], hh, nb, ldq, ldh, s);
worked_on += 16;
} }
#endif #endif
if (worked_on != nq)
{
// printf("Error in AVX512 real BLOCK 2 kernel \n");
// abort();
}
} }
/** /**
......
...@@ -46,7 +46,8 @@ ...@@ -46,7 +46,8 @@
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -104,10 +105,10 @@ void quad_hh_trafo_real_avx512_4hv_double(double* q, double* hh, int* pnb, int* ...@@ -104,10 +105,10 @@ void quad_hh_trafo_real_avx512_4hv_double(double* q, double* hh, int* pnb, int*
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
//Forward declaration //Forward declaration
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_32_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_48_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
...@@ -157,6 +158,9 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -157,6 +158,9 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 4 householder vectors simultaneously // 4 householder vectors simultaneously
...@@ -210,12 +214,14 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -210,12 +214,14 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn
for (i = 0; i < nq-24; i+=32) for (i = 0; i < nq-24; i+=32)
{ {
hh_trafo_kernel_32_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_32_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-32; i+=48) for (i = 0; i < nq-48; i+=64)
{ {
hh_trafo_kernel_48_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_64_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
} }
#endif #endif
if (nq == i) if (nq == i)
...@@ -226,41 +232,77 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -226,41 +232,77 @@ void quad_hh_trafo_real_avx512_4hv_single(float* q, float* hh, int* pnb, int* pn
if (nq-i == 24) if (nq-i == 24)
{ {
hh_trafo_kernel_24_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_24_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 24;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 48)
{
hh_trafo_kernel_48_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 48;
} }
#endif
else #ifdef DOUBLE_PRECISION_REAL
{ if (nq-i == 16)
if (nq-i == 16) {
{
hh_trafo_kernel_16_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_16_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
} worked_on += 16;
else }
{
hh_trafo_kernel_8_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
}
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 32) if (nq-i == 32)
{ {
hh_trafo_kernel_32_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_32_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
} worked_on += 32;
else }
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{ {
hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_8_AVX512_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 8;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 16;
}
#endif
if (worked_on != nq)
{
// printf("Error in AVX512 real BLOCK 2 kernel \n");
// abort();
}
} }
#ifdef DOUBLE_PRECISION_REAL
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 32 rows of Q simultaneously, a * 32 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 64 rows of Q simultaneously, a
#endif
* matrix Vector product with four householder * matrix Vector product with four householder
* vectors + a rank 1 update is performed * vectors + a rank 1 update is performed
*/ */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
{ {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+3] * hh // Matrix Vector Multiplication, Q [4 x nb+3] * hh
...@@ -691,7 +733,6 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, i ...@@ -691,7 +733,6 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, i
_AVX512_STORE(&q[((nb+2)*ldq)+3*offset],q4); _AVX512_STORE(&q[((nb+2)*ldq)+3*offset],q4);
} }
#endif /* DOUBLE_PRECISION_REAL */
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
...@@ -708,7 +749,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, i ...@@ -708,7 +749,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_double(double* q, double* hh, i
__forceinline void hh_trafo_kernel_24_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) __forceinline void hh_trafo_kernel_24_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_48_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif #endif
{ {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
...@@ -1089,7 +1130,7 @@ __forceinline void hh_trafo_kernel_48_AVX_4hv_single(float* q, float* hh, int nb ...@@ -1089,7 +1130,7 @@ __forceinline void hh_trafo_kernel_48_AVX_4hv_single(float* q, float* hh, int nb
__forceinline void hh_trafo_kernel_16_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) __forceinline void hh_trafo_kernel_16_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_32_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif #endif
{ {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
...@@ -1380,7 +1421,7 @@ __forceinline void hh_trafo_kernel_32_AVX_4hv_single(float* q, float* hh, int nb ...@@ -1380,7 +1421,7 @@ __forceinline void hh_trafo_kernel_32_AVX_4hv_single(float* q, float* hh, int nb
__forceinline void hh_trafo_kernel_8_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) __forceinline void hh_trafo_kernel_8_AVX512_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4) __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif #endif
{ {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
......
...@@ -48,6 +48,7 @@ ...@@ -48,6 +48,7 @@
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -113,6 +114,8 @@ void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* ...@@ -113,6 +114,8 @@ void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods); static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods); static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
...@@ -162,6 +165,9 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -162,6 +165,9 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 6 householder vectors simultaneously // 6 householder vectors simultaneously
...@@ -273,12 +279,14 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -273,12 +279,14 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
for (i = 0; i < nq-24; i+=32) for (i = 0; i < nq-24; i+=32)
{ {
hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-16; i+=32) for (i = 0; i < nq-48; i+=64)
{ {
hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
} }
#endif #endif
if (nq == i) if (nq == i)
...@@ -289,26 +297,54 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -289,26 +297,54 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
if (nq-i == 24) if (nq-i == 24)
{ {
hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 24;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i ==48)
{
hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 48;
} }
else #endif
{
if (nq-i == 16) #ifdef DOUBLE_PRECISION_REAL
{ if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
} worked_on += 16;
else }
{ #endif
hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
} #ifdef SINGLE_PRECISION_REAL
if (nq-i ==32)
{
hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 32;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 8;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 16) { if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
} else { worked_on += 16;
printf("ERROR in avx512 kernel\n"); }
}
#endif #endif
if (worked_on != nq)
{
// printf("ERROR in avx512 kernel\n");
// abort();
}
} }
/** /**
...@@ -319,7 +355,6 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn ...@@ -319,7 +355,6 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
* 16 rows of Q simultaneously, a * 16 rows of Q simultaneously, a
#endif #endif
* matrix Vector product with six householder * matrix Vector product with six householder
* vectors + a rank 1 update is performed * vectors + a rank 1 update is performed
*/ */
...@@ -1349,14 +1384,24 @@ __forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int ...@@ -1349,14 +1384,24 @@ __forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int
} }
#ifdef DOUBLE_PRECISION_REAL
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 24 rows of Q simultaneously, a * 24 rows of Q simultaneously, a