Commit f4495fb0 authored by Andreas Marek
Browse files

Check error in AVX/AVX2 kernels

parent 72700e47
...@@ -64,6 +64,7 @@ ...@@ -64,6 +64,7 @@
#include <complex.h> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex ...@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex
} }
#endif #endif
if (worked_on != nq) { if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 1 kernel \n"); //printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//abort();
} }
} }
......
...@@ -64,6 +64,7 @@ ...@@ -64,6 +64,7 @@
#include <complex.h> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex ...@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
} }
#endif #endif
if (worked_on != nq) { if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 2 kernel \n"); //printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//abort();
} }
} }
......
...@@ -62,6 +62,8 @@ ...@@ -62,6 +62,8 @@
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -142,7 +144,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int ...@@ -142,7 +144,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
//Forward declaration //Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
...@@ -194,6 +195,9 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* ...@@ -194,6 +195,9 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 4 householder vectors simultaneously // 4 householder vectors simultaneously
...@@ -246,41 +250,58 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* ...@@ -246,41 +250,58 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
for (i = 0; i < nq-8; i+=12) for (i = 0; i < nq-8; i+=12)
{ {
hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-16; i+=24) for (i = 0; i < nq-16; i+=24)
{ {
hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
} }
#endif #endif
if (nq == i) if (nq == i)
{ {
return; return;
} }
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
else if (nq-i == 8)
{ {
if (nq-i > 4) hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
{ worked_on += 8;
hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
else
{
hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 16) if (nq-i == 16)
{ {
hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 16;
} }
else #endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{ {
hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 4;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 8;
}
#endif
if (worked_on != nq)
{
//printf("Error in real AVX/AVX2 BLOCK4 kernel \n");
//abort();
}
} }
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
...@@ -1439,6 +1460,8 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, ...@@ -1439,6 +1460,8 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
_AVX_STORE(&q[(nb+2)*ldq],q1); _AVX_STORE(&q[(nb+2)*ldq],q1);
} }
#if 0
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
...@@ -1691,4 +1714,4 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb, ...@@ -1691,4 +1714,4 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb,
// _AVX_STORE(&q[(nb+2)*ldq],q1); // _AVX_STORE(&q[(nb+2)*ldq],q1);
} }
#endif /* SINGLE_PRECISION_REAL */ #endif /* SINGLE_PRECISION_REAL */
#endif
...@@ -63,6 +63,8 @@ ...@@ -63,6 +63,8 @@
#include "config-f90.h" #include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
...@@ -142,7 +144,6 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int ...@@ -142,7 +144,6 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
//Forward declaration //Forward declaration
static void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods); static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods); static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
...@@ -193,6 +194,9 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* ...@@ -193,6 +194,9 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
int nq = *pldq; int nq = *pldq;
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute // calculating scalar products to compute
// 6 householder vectors simultaneously // 6 householder vectors simultaneously
...@@ -303,27 +307,39 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int* ...@@ -303,27 +307,39 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
for (i = 0; i < nq-4; i+=8) for (i = 0; i < nq-4; i+=8)
{ {
hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
} }
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-8; i+=16) for (i = 0; i < nq-8; i+=16)
{ {
hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
} }
#endif #endif
if (nq == i) if (nq == i)
{ {
return; return;
} }
else
{
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 4;
}
#endif #endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif worked_on += 8;
} }
#endif
if (worked_on != nq)
{
//printf("Error in real AVX/AVX2 BLOCK6 kernel \n");
//abort();
}
} }
/** /**
...@@ -1687,6 +1703,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, ...@@ -1687,6 +1703,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
_AVX_STORE(&q[(nb+4)*ldq],q1); _AVX_STORE(&q[(nb+4)*ldq],q1);
} }
#if 0
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
...@@ -2257,3 +2274,4 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, ...@@ -2257,3 +2274,4 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
_mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1)); _mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1));
} }
#endif #endif
#endif
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment