Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
3328e7bb
Unverified
Commit
3328e7bb
authored
May 23, 2016
by
Andreas Marek
Browse files
Error in AVX2 single precision kernels
parent
69fd894b
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
View file @
3328e7bb
...
...
@@ -585,7 +585,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
h2
=
_mm256_mul_ps
(
h1
,
vs
);
#ifdef __ELPA_USE_FMA__
y1
=
_mm256_FMA_ps
(
y1
,
h1
,
_mm256_mul_ps
(
x1
,
h2
));
y2
=
_mm256_FMA_ps
(
y
s
,
h1
,
_mm256_mul_ps
(
x2
,
h2
));
y2
=
_mm256_FMA_ps
(
y
2
,
h1
,
_mm256_mul_ps
(
x2
,
h2
));
// y3 = _mm256_FMA_ps(y3, h1, _mm256_mul_ps(x3,h2));
// y4 = _mm256_FMA_ps(y4, h1, _mm256_mul_ps(x4,h2));
#else
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
View file @
3328e7bb
...
...
@@ -380,7 +380,7 @@ __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb
#ifdef __ELPA_USE_FMA__
y1
=
_mm256_FMA_ps
(
q1
,
h2
,
y1
);
y2
=
_mm256_FMA_ps
(
q2
,
h2
,
y2
);
/
y3
=
_mm256_FMA_ps
(
q3
,
h2
,
y3
);
/
/ y3 = _mm256_FMA_ps(q3, h2, y3);
#else
y1
=
_mm256_add_ps
(
y1
,
_mm256_mul_ps
(
q1
,
h2
));
y2
=
_mm256_add_ps
(
y2
,
_mm256_mul_ps
(
q2
,
h2
));
...
...
@@ -915,7 +915,7 @@ __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb
h3
=
_mm256_mul_ps
(
h1
,
vs_2_3
);
#ifdef __ELPA_USE_FMA__
z1
=
_mm256_FMSUB_ps
(
z1
,
h1
,
_mm256_FMA_ps
(
y1
,
h3
,
_mm256_mul_ps
(
x1
,
h2
)));
z2
=
_mm256_FMSUB_ps
(
z2
,
h1
,
_mm256_FMA_p
d
(
y2
,
h3
,
_mm256_mul_ps
(
x2
,
h2
)));
z2
=
_mm256_FMSUB_ps
(
z2
,
h1
,
_mm256_FMA_p
s
(
y2
,
h3
,
_mm256_mul_ps
(
x2
,
h2
)));
#else
z1
=
_mm256_sub_ps
(
_mm256_mul_ps
(
z1
,
h1
),
_mm256_add_ps
(
_mm256_mul_ps
(
y1
,
h3
),
_mm256_mul_ps
(
x1
,
h2
)));
z2
=
_mm256_sub_ps
(
_mm256_mul_ps
(
z2
,
h1
),
_mm256_add_ps
(
_mm256_mul_ps
(
y2
,
h3
),
_mm256_mul_ps
(
x2
,
h2
)));
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
View file @
3328e7bb
...
...
@@ -54,16 +54,16 @@
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_p
d
(a,b,c) _mm256_macc_p
d
(a,b,c)
#define _mm256_NFMA_p
d
(a,b,c) _mm256_nmacc_p
d
(a,b,c)
#define _mm256_FMSUB_p
d
(a,b,c) _mm256_msub(a,b,c)
#define _mm256_FMA_p
s
(a,b,c) _mm256_macc_p
s
(a,b,c)
#define _mm256_NFMA_p
s
(a,b,c) _mm256_nmacc_p
s
(a,b,c)
#define _mm256_FMSUB_p
s
(a,b,c) _mm256_msub
_ps
(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_p
d
(a,b,c) _mm256_fmadd_p
d
(a,b,c)
#define _mm256_NFMA_p
d
(a,b,c) _mm256_fnmadd_p
d
(a,b,c)
#define _mm256_FMSUB_p
d
(a,b,c) _mm256_fmsub_p
d
(a,b,c)
#define _mm256_FMA_p
s
(a,b,c) _mm256_fmadd_p
s
(a,b,c)
#define _mm256_NFMA_p
s
(a,b,c) _mm256_fnmadd_p
s
(a,b,c)
#define _mm256_FMSUB_p
s
(a,b,c) _mm256_fmsub_p
s
(a,b,c)
#endif
#endif
...
...
@@ -1511,7 +1511,7 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
#endif
h3
=
_mm256_broadcast_ss
(
&
hh
[(
ldh
*
2
)
+
nb
-
2
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm256_NFMA_p
d
(
z1
,
h3
,
q1
);
q1
=
_mm256_NFMA_p
s
(
z1
,
h3
,
q1
);
#else
q1
=
_mm256_sub_ps
(
q1
,
_mm256_mul_ps
(
z1
,
h3
));
#endif
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment