Unverified commit f5feb969 authored by Andreas Marek

Alignment error due to wrong stripe_width

In the case of single-precision calculations the stripe_width needs to
be a multiple that differs from the double-precision one by a factor of 2,
since 32-byte alignment is required and sizeof(float) and sizeof(double)
differ by a factor of two.

This commit closes issue #18
parent 789121d6
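The required multiple follows from the 32-byte (AVX/SSE) alignment target: a stripe must cover a whole number of aligned blocks, so the minimum multiple is 32 divided by the element size. A minimal C sketch of that relationship (required_multiple is an illustrative helper, not an ELPA routine):

#include <stddef.h>

/* Smallest multiple that stripe_width must obey so that one stripe
   spans a whole number of 32-byte aligned blocks. */
static size_t required_multiple(size_t elem_size)
{
    return 32 / elem_size;
}

/* required_multiple(sizeof(double))   == 4  (real, double precision)
   required_multiple(sizeof(float))    == 8  (real, single precision)
   required_multiple(2*sizeof(double)) == 2  (complex, double precision)
   required_multiple(2*sizeof(float))  == 4  (complex, single precision) */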
@@ -3155,7 +3155,11 @@
if (useGPU) then
stripe_width = 256
else
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
endif
#ifdef WITH_OPENMP
@@ -3185,7 +3189,13 @@
#endif /* WITH_OPENMP */
if (.not.(useGPU)) then
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+1)/2)*2 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
! (2 * sizeof(double complex) == 32)
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
endif
#ifndef WITH_OPENMP
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
......
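The expressions of the form ((stripe_width+3)/4)*4 above are the usual integer round-up-to-a-multiple idiom. A small standalone C sketch (round_up is a name chosen here for illustration):

/* Round n up to the next multiple of k with integer arithmetic,
   equivalent to ((stripe_width+3)/4)*4 for k == 4. */
static int round_up(int n, int k)
{
    return ((n + k - 1) / k) * k;
}

/* round_up(45, 4) == 48, round_up(48, 4) == 48, round_up(49, 8) == 56 */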
@@ -3539,7 +3539,11 @@
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 4
#else
stripe_width = 96 ! Must be a multiple of 8
#endif
#ifdef WITH_OPENMP
stripe_count = (thread_width-1)/stripe_width + 1
#else
@@ -3551,7 +3555,13 @@
#else
stripe_width = (l_nev-1)/stripe_count + 1
#endif
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
#endif
else ! GPUs are used
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
......
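The fixed defaults above (48 for double precision, 96 for single precision) can be checked against the 32-byte alignment rule at compile time; a hedged C11 sketch, not part of the ELPA sources:

_Static_assert(48 % (32 / sizeof(double)) == 0,
               "double-precision default stripe_width is a multiple of 4");
_Static_assert(96 % (32 / sizeof(float)) == 0,
               "single-precision default stripe_width is a multiple of 8");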
@@ -61,7 +61,6 @@
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
@@ -81,7 +80,9 @@
#endif
//Forward declaration
// 4 rows in single precision do not work with AVX since they cannot be 32-byte aligned; use SSE instead
__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
//__forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
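The remark about the 4-row single-precision case reflects the register widths: an AVX load of 8 floats (_mm256_load_ps) expects 32-byte alignment, while a 4-float row is only 16 bytes and fits a 16-byte-aligned SSE load (_mm_load_ps). A minimal illustration, where q4 is a made-up buffer rather than ELPA data:

#include <x86intrin.h>

/* Four consecutive floats occupy 16 bytes: enough for a 16-byte-aligned
   SSE load, but not for a 32-byte AVX load of eight floats. */
static __attribute__((aligned(16))) float q4[4] = {1.f, 2.f, 3.f, 4.f};

static void load_four_rows(void)
{
    __m128 v = _mm_load_ps(q4);  /* OK: 128-bit load, 16-byte alignment */
    /* __m256 w = _mm256_load_ps(q4);  would read 32 bytes and require
       32-byte alignment, which a 4-row single-precision stripe cannot
       guarantee -- hence the SSE fallback kernel. */
    (void) v;
}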
@@ -139,6 +140,8 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
{
hh_trafo_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+16], hh, nb, ldq, ldh, s);
// hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(&q[i+8], hh, nb, ldq, ldh, s);
}
else if (nq-i == 16)
{
@@ -148,6 +151,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
{
hh_trafo_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX_2hv_single(&q[i+8], hh, nb, ldq, ldh, s);
// hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(&q[i+8], hh, nb, ldq, ldh, s);
}
else if (nq-i == 8)
{
@@ -156,6 +160,8 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
else
{
hh_trafo_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
// hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(&q[i], hh, nb, ldq, ldh, s);
}
}
@@ -870,6 +876,105 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
#endif
}
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously; a
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed
*/
__forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+1] * hh
// hh contains two householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Bit mask needed for the floating-point sign flip
// (careful here: 0x80000000 is the single-precision sign bit, set in every lane)
__m128 sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
__m128 x1 = _mm_load_ps(&q[ldq]);
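// Broadcast trick used throughout this kernel: load hh[ldh+1] together with its
// neighbour as one double and duplicate it into both 64-bit lanes; _mm_moveldup_ps
// then copies the even lanes so all four floats hold hh[ldh+1].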
__m128 x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1]));
__m128 x3 ;
__m128 h1 = _mm_moveldup_ps(x2);
__m128 h2;
__m128 q1 = _mm_load_ps(q);
__m128 y1 = _mm_add_ps(q1, _mm_mul_ps(x1, h1));
for(i = 2; i < nb; i++)
{
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1]));
h1 = _mm_moveldup_ps(x2);
x3 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i]));
h2 = _mm_moveldup_ps(x3);
q1 = _mm_load_ps(&q[i*ldq]);
x1 = _mm_add_ps(x1, _mm_mul_ps(q1,h1));
y1 = _mm_add_ps(y1, _mm_mul_ps(q1,h2));
}
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1]));
h1 = _mm_moveldup_ps(x2);
q1 = _mm_load_ps(&q[nb*ldq]);
x1 = _mm_add_ps(x1, _mm_mul_ps(q1,h1));
/////////////////////////////////////////////////////
// Rank-2 update of Q [4 x nb+1]
/////////////////////////////////////////////////////
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) hh));
__m128 tau1 = _mm_moveldup_ps(x2);
x3 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh]));
__m128 tau2 = _mm_moveldup_ps(x3);
__m128 x4 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &s));
__m128 vs = _mm_moveldup_ps(x4);
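// XOR with the sign mask flips the IEEE-754 sign bit, i.e. it negates tau1/tau2
// before they are applied in the rank-2 update below.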
h1 = _mm_xor_ps(tau1, sign);
x1 = _mm_mul_ps(x1, h1);
h1 = _mm_xor_ps(tau2, sign);
h2 = _mm_mul_ps(h1, vs);
y1 = _mm_add_ps(_mm_mul_ps(y1,h1), _mm_mul_ps(x1,h2));
q1 = _mm_load_ps(q);
q1 = _mm_add_ps(q1, y1);
_mm_store_ps(q,q1);
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+1]));
h2 = _mm_moveldup_ps(x2);
q1 = _mm_load_ps(&q[ldq]);
q1 = _mm_add_ps(q1, _mm_add_ps(x1, _mm_mul_ps(y1, h2)));
_mm_store_ps(&q[ldq],q1);
for (i = 2; i < nb; i++)
{
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[i-1]));
h1 = _mm_moveldup_ps(x2);
x3 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[ldh+i]));
h2 = _mm_moveldup_ps(x3);
q1 = _mm_load_ps(&q[i*ldq]);
q1 = _mm_add_ps(q1, _mm_add_ps(_mm_mul_ps(x1,h1), _mm_mul_ps(y1, h2)));
_mm_store_ps(&q[i*ldq],q1);
}
x2 = _mm_castpd_ps(_mm_loaddup_pd( (double *) &hh[nb-1]));
h1 = _mm_moveldup_ps(x2);
q1 = _mm_load_ps(&q[nb*ldq]);
q1 = _mm_add_ps(q1, _mm_mul_ps(x1, h1));
_mm_store_ps(&q[nb*ldq],q1);
}
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
......
@@ -90,7 +90,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
void quad_hh_trafo_real_avx_avx2_4hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
!f>#ifdef HAVE_AVX
......