Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
8fd36a16
Commit
8fd36a16
authored
Jun 07, 2016
by
Andreas Marek
Browse files
Error in computing the stripe_width for AVX-512 kernels
parent
6788de0b
Changes
2
Hide whitespace changes
Inline
Side-by-side
src/elpa2_compute_real_template.X90
View file @
8fd36a16
...
...
@@ -3544,7 +3544,7 @@
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 4
#else
stripe_width =
96
! Must be a multiple of 8
stripe_width =
48
! Must be a multiple of 8
#endif
#ifdef WITH_OPENMP
stripe_count = (thread_width-1)/stripe_width + 1
...
...
@@ -3562,12 +3562,12 @@
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6) then
stripe_width = ((stripe_width+
3
)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
stripe_width = ((stripe_width+
7
)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
endif
#else
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
...
...
src/elpa2_kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
View file @
8fd36a16
...
...
@@ -216,7 +216,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
x3
=
_mm512_mul_pd
(
x3
,
h1
);
x4
=
_mm512_mul_pd
(
x4
,
h1
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h2
=
_mm512_mul_pd
(
h1
,
vs
);
y1
=
_mm512_FMA_pd
(
y1
,
h1
,
_mm512_mul_pd
(
x1
,
h2
));
y2
=
_mm512_FMA_pd
(
y2
,
h1
,
_mm512_mul_pd
(
x2
,
h2
));
...
...
@@ -364,7 +364,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
x2
=
_mm512_mul_pd
(
x2
,
h1
);
x3
=
_mm512_mul_pd
(
x3
,
h1
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h2
=
_mm512_mul_pd
(
h1
,
vs
);
y1
=
_mm512_FMA_pd
(
y1
,
h1
,
_mm512_mul_pd
(
x1
,
h2
));
y2
=
_mm512_FMA_pd
(
y2
,
h1
,
_mm512_mul_pd
(
x2
,
h2
));
...
...
@@ -481,11 +481,11 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__m512d
tau2
=
_mm512_set1_pd
(
hh
[
ldh
]);
__m512d
vs
=
_mm512_set1_pd
(
s
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau1
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau1
,
(
__m512i
)
sign
);
x1
=
_mm512_mul_pd
(
x1
,
h1
);
x2
=
_mm512_mul_pd
(
x2
,
h1
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h2
=
_mm512_mul_pd
(
h1
,
vs
);
y1
=
_mm512_FMA_pd
(
y1
,
h1
,
_mm512_mul_pd
(
x1
,
h2
));
...
...
@@ -580,10 +580,10 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__m512d
tau2
=
_mm512_set1_pd
(
hh
[
ldh
]);
__m512d
vs
=
_mm512_set1_pd
(
s
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau1
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau1
,
(
__m512i
)
sign
);
x1
=
_mm512_mul_pd
(
x1
,
h1
);
h1
=
(
__m512d
)
_mm512_xor_
si512
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h1
=
(
__m512d
)
_mm512_xor_
epi64
((
__m512i
)
tau2
,
(
__m512i
)
sign
);
h2
=
_mm512_mul_pd
(
h1
,
vs
);
y1
=
_mm512_FMA_pd
(
y1
,
h1
,
_mm512_mul_pd
(
x1
,
h2
));
...
...
@@ -664,9 +664,9 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__m512d tau2 = _mm512_set1_pd(hh[ldh]);
__m512d vs = _mm512_set1_pd(s);
h1 = (__m512d) _mm512_xor_
si512
((__m512i) tau1, (__m512i) sign);
h1 = (__m512d) _mm512_xor_
epi64
((__m512i) tau1, (__m512i) sign);
x1 = _mm512_mul_pd(x1, h1);
h1 = (__m512d) _mm512_xor_
si512
((__m512i) tau2, (__m512i) sign);
h1 = (__m512d) _mm512_xor_
epi64
((__m512i) tau2, (__m512i) sign);
h2 = _mm512_mul_pd(h1, vs);
y1 = _mm512_FMA_pd(y1, h1, _mm512_mul_pd(x1,h2));
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment