Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sebastian Ohlmann
elpa
Commits
05cad264
Commit
05cad264
authored
Jun 07, 2019
by
Andreas Marek
Browse files
Start to unify complex SSE and AVX kernels
parent
9493dde8
Changes
1
Hide whitespace changes
Inline
Side-by-side
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
View file @
05cad264
...
...
@@ -67,9 +67,10 @@
//define instruction set numbers
#define SSE_128 128
#define AVX_256 256
#define NEON_ARCH64_128 1285
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == 512
#if VEC_SET == SSE_128 || VEC_SET ==
AVX_
256 || VEC_SET == 512
#include
<x86intrin.h>
#ifdef BLOCK2
#include
<pmmintrin.h>
...
...
@@ -99,6 +100,12 @@
#define SIMD_SET SSE
#endif
#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SIMD_DATATYPE __m128d
...
...
@@ -130,6 +137,87 @@
#define _SHUFFLE 0xb1
#endif
#endif
/* VEC_SET == SSE_128 */
/*
 * AVX (256-bit) bindings for the generic _SIMD_* names used by the complex
 * kernels in this template.  One branch per precision; the including file is
 * expected to define DOUBLE_PRECISION_COMPLEX or SINGLE_PRECISION_COMPLEX
 * (NOTE(review): nothing here enforces that exactly one is set -- confirm).
 */
#if VEC_SET == AVX_256

#ifdef DOUBLE_PRECISION_COMPLEX
/* Index step between consecutive vector loads/stores on the real-typed view
 * of the data (4 doubles per __m256d). */
#define offset 4
#define __SIMD_DATATYPE __m256d
#define _SIMD_LOAD _mm256_load_pd
/* NOTE(review): _SIMD_LOADU, _SIMD_STOREU and _SIMD_MADDSUB are set to the
 * constant 1 rather than to an intrinsic; they look like placeholders --
 * confirm before using them as callables. */
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm256_store_pd
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm256_mul_pd
#define _SIMD_ADD _mm256_add_pd
#define _SIMD_XOR _mm256_xor_pd
#define _SIMD_BROADCAST _mm256_broadcast_sd
#define _SIMD_MADDSUB 1
#define _SIMD_ADDSUB _mm256_addsub_pd
#define _SIMD_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5

/*
 * With HAVE_AVX2 the fused multiply-add/sub wrappers are chosen from the
 * compiler feature macros: FMA4 spellings under __FMA4__, FMA3 spellings
 * under __AVX2__.  If both macros are set, the second block redefines the
 * wrappers and therefore wins.
 * NOTE(review): __AVX2__ is used as the FMA3 gate; compilers advertise FMA3
 * through __FMA__ -- confirm this is intended.
 */
#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif

#endif /* HAVE_AVX2 */

/* Defined unconditionally; the wrapper they name only exists when one of the
 * branches above was taken (users presumably guard on __ELPA_USE_FMA__ --
 * confirm). */
#define _SIMD_FMADDSUB _mm256_FMADDSUB_pd
#define _SIMD_FMSUBADD _mm256_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef SINGLE_PRECISION_COMPLEX
/* Index step between consecutive vector loads/stores on the real-typed view
 * of the data (8 floats per __m256). */
#define offset 8
#define __SIMD_DATATYPE __m256
#define _SIMD_LOAD _mm256_load_ps
/* NOTE(review): placeholders, see the double-precision branch -- confirm. */
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm256_store_ps
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm256_mul_ps
#define _SIMD_ADD _mm256_add_ps
#define _SIMD_XOR _mm256_xor_ps
#define _SIMD_BROADCAST _mm256_broadcast_ss
#define _SIMD_MADDSUB 1
#define _SIMD_ADDSUB _mm256_addsub_ps
/* The operands are __m256 (__SIMD_DATATYPE above), so the 256-bit shuffle is
 * required; the 128-bit _mm_shuffle_ps only accepts __m128. */
#define _SIMD_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1

/* Same FMA4 / FMA3 selection as in the double-precision branch. */
#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#endif /* HAVE_AVX2 */

#define _SIMD_FMADDSUB _mm256_FMADDSUB_ps
#define _SIMD_FMSUBADD _mm256_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */

#endif /* VEC_SET == AVX_256 */
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE_INTRINSICS
...
...
@@ -164,6 +252,17 @@
#endif
#endif
/* VEC_SET == SSE_128 */
/*
 * AVX_256: set ROW_LENGTH to 12 (double precision) or 24 (single precision).
 * ROW_LENGTH is one of the arguments handed to CONCAT_8ARGS in the forward
 * declaration that follows, so it must be (re)defined before that point.
 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 24
#endif
#endif
/* VEC_SET == AVX_256 */
//Forward declaration
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
...
...
@@ -173,6 +272,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
,
int
ldh
,
DATA_TYPE
s
);
#endif
/*
 * Forward declaration of one kernel variant.
 *
 * ROW_LENGTH is first re-set for the active instruction set and precision:
 *   SSE_128: 5 (double) / 10 (single)
 *   AVX_256: 10 (double) / 20 (single)
 * and is then passed, together with SIMD_SET, BLOCK and WORD_LENGTH, to
 * CONCAT_8ARGS to form the function name (presumably by token pasting --
 * CONCAT_8ARGS is defined outside this view).
 */
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 10
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 20
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
/* BLOCK1: the parameter list ends after ldq. */
#ifdef BLOCK1
);
#endif
/* BLOCK2: two further parameters, ldh and s, are appended. */
#ifdef BLOCK2
,
int
ldh
,
DATA_TYPE
s
);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
...
...
@@ -183,6 +315,52 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == SSE_128 */
/*
 * AVX_256: set ROW_LENGTH to 8 (double precision) or 16 (single precision),
 * then forward-declare the kernel whose name CONCAT_8ARGS builds from
 * ROW_LENGTH, SIMD_SET, BLOCK and WORD_LENGTH.
 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
/* BLOCK1: the parameter list ends after ldq. */
#ifdef BLOCK1
);
#endif
/* BLOCK2: two further parameters, ldh and s, are appended. */
#ifdef BLOCK2
,
int
ldh
,
DATA_TYPE
s
);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
);
...
...
@@ -201,6 +379,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#define ROW_LENGTH 4
#endif
#endif
/* VEC_SET == SSE_128 */
/*
 * AVX_256: set ROW_LENGTH to 4 (double precision) or 8 (single precision),
 * then forward-declare the kernel whose name CONCAT_8ARGS builds from
 * ROW_LENGTH, SIMD_SET, BLOCK and WORD_LENGTH.
 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
/* BLOCK1: the parameter list ends after ldq. */
#ifdef BLOCK1
);
#endif
/* BLOCK2: two further parameters, ldh and s, are appended. */
#ifdef BLOCK2
,
int
ldh
,
DATA_TYPE
s
);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 1
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
);
...
...
@@ -209,6 +430,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
,
int
ldh
,
DATA_TYPE
s
);
#endif
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
...
...
@@ -239,6 +461,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
...
...
@@ -298,16 +521,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef BLOCK1
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#define STEP_SIZE 6
#define UPPER_BOUND
4
#define UPPER_BOUND
5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 8
#define UPPER_BOUND 10
#endif
#endif
/* VEC_SET == SSE_128 */
/*
 * AVX_256 parameters for the loop that follows, which iterates
 * i = 0; i < nq - UPPER_BOUND; i += STEP_SIZE.
 *   double precision: ROW_LENGTH 12, STEP_SIZE 12, UPPER_BOUND 10
 *   single precision: ROW_LENGTH 24, STEP_SIZE 24, UPPER_BOUND 20
 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#define STEP_SIZE 24
#define UPPER_BOUND 20
#endif
#endif
/* VEC_SET == AVX_256 */
for
(
i
=
0
;
i
<
nq
-
UPPER_BOUND
;
i
+=
STEP_SIZE
)
{
...
...
@@ -318,6 +557,96 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
return
;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 10
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 20
#endif
#endif
/* VEC_SET == AVX_256 */
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#endif
/* VEC_SET == AVX_256 */
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#endif
/* VEC_SET == AVX_256 */
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
...
...
@@ -325,12 +654,26 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == AVX_256 */
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 1
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
...
...
@@ -338,11 +681,13 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#endif
/* VEC_SET == AVX_256 */
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
#endif
/* BLOCK1 */
#ifdef BLOCK2
...
...
@@ -372,11 +717,9 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#define STEP_SIZE 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#endif
if
(
nq
-
i
==
ROW_LENGTH
)
{
...
...
@@ -384,26 +727,49 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
worked_on
+=
ROW_LENGTH
;
}
#endif
/* BLOCK2 */
#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
printf
(
"Error in complex SIMD_SET BLOCK BLOCK kernel %d %d
\n
"
,
worked_on
,
nq
);
abort
();
}
#endif
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 1
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
ROW_LENGTH
;
}
#endif
/* BLOCK2 */
//#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
printf
(
"Error in complex SIMD_SET BLOCK BLOCK kernel %d %d
\n
"
,
worked_on
,
nq
);
abort
();
}
//#endif
}
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
)
...
...
@@ -1178,18 +1544,32 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
_SIMD_STORE
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
offset
],
q2
);
_SIMD_STORE
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
2
*
offset
],
q3
);
_SIMD_STORE
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
3
*
offset
],
q4
);
_SIMD_STORE
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
4
*
offset
],
q5
);
_SIMD_STORE
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
5
*
offset
],
q6
);
#endif
/* BLOCK2 */
}
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH
4
#define ROW_LENGTH
5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#define ROW_LENGTH 10
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 20
#endif
#endif
/* VEC_SET == AVX_256 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
)
...
...
@@ -1198,20 +1578,21 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
,
int
ldh
,
DATA_TYPE
s
)
#endif
{
DATA_TYPE_REAL_PTR
q_dbl
=
(
DATA_TYPE_REAL_PTR
)
q
;
DATA_TYPE_REAL_PTR
hh_dbl
=
(
DATA_TYPE_REAL_PTR
)
hh
;
#ifdef BLOCK2
DATA_TYPE_REAL_PTR
s_dbl
=
(
DATA_TYPE_REAL_PTR
)(
&
s
);
#endif
__SIMD_DATATYPE
x1
,
x2
,
x3
,
x4
;
__SIMD_DATATYPE
q1
,
q2
,
q3
,
q4
;
__SIMD_DATATYPE
x1
,
x2
,
x3
,
x4
,
x5
;
__SIMD_DATATYPE
q1
,
q2
,
q3
,
q4
,
q5
;
#ifdef BLOCK2
__SIMD_DATATYPE
y1
,
y2
,
y3
,
y4
;
__SIMD_DATATYPE
y1
,
y2
,
y3
,
y4
,
y5
;
__SIMD_DATATYPE
h2_real
,
h2_imag
;
#endif
__SIMD_DATATYPE
h1_real
,
h1_imag
;
__SIMD_DATATYPE
tmp1
,
tmp2
,
tmp3
,
tmp4
;
__SIMD_DATATYPE
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
;
int
i
=
0
;
#if VEC_SET == SSE_128
...
...
@@ -1219,16 +1600,17 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm_set_epi64x
(
0x8000000000000000
,
0x8000000000000000
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm_set_epi32
(
0x80000000
,
0x80000000
,
0x80000000
,
0x80000000
);
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm_set_epi32
(
0x80000000
,
0x80000000
,
0x80000000
,
0x80000000
);
#endif
#endif
/* VEC_SET == SSE_128 */
#ifdef BLOCK2
x1
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
offset
]);
x3
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
2
*
offset
]);
x4
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
3
*
offset
]);
x5
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
4
*
offset
]);
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -1250,6 +1632,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
y2
=
_SIMD_LOAD
(
&
q_dbl
[
offset
]);
y3
=
_SIMD_LOAD
(
&
q_dbl
[
2
*
offset
]);
y4
=
_SIMD_LOAD
(
&
q_dbl
[
3
*
offset
]);
y5
=
_SIMD_LOAD
(
&
q_dbl
[
4
*
offset
]);
tmp1
=
_SIMD_MUL
(
h2_imag
,
x1
);