Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
f4495fb0
Commit
f4495fb0
authored
Aug 02, 2017
by
Andreas Marek
Browse files
Check error in AVX/AVX2 kernels
parent
72700e47
Changes
5
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
src/elpa2/kernels/complex_avx-avx2_1hv_template.Xc
View file @
f4495fb0
...
@@ -64,6 +64,7 @@
...
@@ -64,6 +64,7 @@
#include <complex.h>
#include <complex.h>
#include <x86intrin.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
#define __forceinline __attribute__((always_inline))
...
@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex
...
@@ -294,7 +295,8 @@ void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex
}
}
#endif
#endif
if (worked_on != nq) {
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 1 kernel \n");
//abort();
}
}
}
}
...
...
src/elpa2/kernels/complex_avx-avx2_2hv_template.Xc
View file @
f4495fb0
...
@@ -64,6 +64,7 @@
...
@@ -64,6 +64,7 @@
#include <complex.h>
#include <complex.h>
#include <x86intrin.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
#define __forceinline __attribute__((always_inline))
...
@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
...
@@ -269,7 +270,8 @@ void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex
}
}
#endif
#endif
if (worked_on != nq) {
if (worked_on != nq) {
printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
//abort();
}
}
}
}
...
...
src/elpa2/kernels/real_avx-avx2_2hv_template.Xc
View file @
f4495fb0
This diff is collapsed.
Click to expand it.
src/elpa2/kernels/real_avx-avx2_4hv_template.Xc
View file @
f4495fb0
...
@@ -62,6 +62,8 @@
...
@@ -62,6 +62,8 @@
#include "config-f90.h"
#include "config-f90.h"
#include <x86intrin.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#define __forceinline __attribute__((always_inline)) static
...
@@ -142,7 +144,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
...
@@ -142,7 +144,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
//Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
...
@@ -194,6 +195,9 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
...
@@ -194,6 +195,9 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
int nq = *pldq;
int nq = *pldq;
int ldq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute
// calculating scalar products to compute
// 4 householder vectors simultaneously
// 4 householder vectors simultaneously
...
@@ -246,41 +250,58 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
...
@@ -246,41 +250,58 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
for (i = 0; i < nq-8; i+=12)
for (i = 0; i < nq-8; i+=12)
{
{
hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_12_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
}
}
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-16; i+=24)
for (i = 0; i < nq-16; i+=24)
{
{
hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_24_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += i;
}
}
#endif
#endif
if (nq == i)
if (nq == i)
{
{
return;
return;
}
}
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
else
if (nq-i == 8)
{
{
if (nq-i > 4)
hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
{
worked_on += 8;
hh_trafo_kernel_8_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
else
{
hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
}
}
}
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 16)
if (nq-i == 16)
{
{
hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_16_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 16;
}
}
else
#endif
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
{
hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
hh_trafo_kernel_4_AVX_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 4;
}
}
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 8;
}
#endif
if (worked_on != nq)
{
//printf("Error in real AVX/AVX2 BLOCK4 kernel \n");
//abort();
}
}
}
/**
/**
* Unrolled kernel that computes
* Unrolled kernel that computes
...
@@ -1439,6 +1460,8 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
...
@@ -1439,6 +1460,8 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
_AVX_STORE(&q[(nb+2)*ldq],q1);
_AVX_STORE(&q[(nb+2)*ldq],q1);
}
}
#if 0
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
/**
/**
* Unrolled kernel that computes
* Unrolled kernel that computes
...
@@ -1691,4 +1714,4 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb,
...
@@ -1691,4 +1714,4 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb,
// _AVX_STORE(&q[(nb+2)*ldq],q1);
// _AVX_STORE(&q[(nb+2)*ldq],q1);
}
}
#endif /* SINGLE_PRECISION_REAL */
#endif /* SINGLE_PRECISION_REAL */
#endif
src/elpa2/kernels/real_avx-avx2_6hv_template.Xc
View file @
f4495fb0
...
@@ -63,6 +63,8 @@
...
@@ -63,6 +63,8 @@
#include "config-f90.h"
#include "config-f90.h"
#include <x86intrin.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#define __forceinline __attribute__((always_inline)) static
...
@@ -142,7 +144,6 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
...
@@ -142,7 +144,6 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
//Forward declaration
//Forward declaration
static void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
...
@@ -193,6 +194,9 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
...
@@ -193,6 +194,9 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
int nq = *pldq;
int nq = *pldq;
int ldq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int ldh = *pldh;
int worked_on;
worked_on = 0;
// calculating scalar products to compute
// calculating scalar products to compute
// 6 householder vectors simultaneously
// 6 householder vectors simultaneously
...
@@ -303,27 +307,39 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
...
@@ -303,27 +307,39 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
for (i = 0; i < nq-4; i+=8)
for (i = 0; i < nq-4; i+=8)
{
{
hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
}
}
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-8; i+=16)
for (i = 0; i < nq-8; i+=16)
{
{
hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += i;
}
}
#endif
#endif
if (nq == i)
if (nq == i)
{
{
return;
return;
}
}
else
{
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
if (nq-i == 4)
{
hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_4_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += 4;
}
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
if (nq-i == 8)
{
hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
worked_on += 8;
}
}
#endif
if (worked_on != nq)
{
//printf("Error in real AVX/AVX2 BLOCK6 kernel \n");
//abort();
}
}
}
/**
/**
...
@@ -1687,6 +1703,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
...
@@ -1687,6 +1703,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
_AVX_STORE(&q[(nb+4)*ldq],q1);
_AVX_STORE(&q[(nb+4)*ldq],q1);
}
}
#if 0
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
/**
/**
* Unrolled kernel that computes
* Unrolled kernel that computes
...
@@ -2257,3 +2274,4 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
...
@@ -2257,3 +2274,4 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
_mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1));
_mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1));
}
}
#endif
#endif
#endif
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment