Commit 05cad264 authored by Andreas Marek's avatar Andreas Marek
Browse files

Start to unify complex see and avx kernels

parent 9493dde8
...@@ -67,9 +67,10 @@ ...@@ -67,9 +67,10 @@
//define instruction set numbers //define instruction set numbers
#define SSE_128 128 #define SSE_128 128
#define AVX_256 256
#define NEON_ARCH64_128 1285 #define NEON_ARCH64_128 1285
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == 512 #if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == 512
#include <x86intrin.h> #include <x86intrin.h>
#ifdef BLOCK2 #ifdef BLOCK2
#include <pmmintrin.h> #include <pmmintrin.h>
...@@ -99,6 +100,12 @@ ...@@ -99,6 +100,12 @@
#define SIMD_SET SSE #define SIMD_SET SSE
#endif #endif
#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2 #define offset 2
#define __SIMD_DATATYPE __m128d #define __SIMD_DATATYPE __m128d
...@@ -130,6 +137,87 @@ ...@@ -130,6 +137,87 @@
#define _SHUFFLE 0xb1 #define _SHUFFLE 0xb1
#endif #endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m256d
#define _SIMD_LOAD _mm256_load_pd
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm256_store_pd
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm256_mul_pd
#define _SIMD_ADD _mm256_add_pd
#define _SIMD_XOR _mm256_xor_pd
#define _SIMD_BROADCAST _mm256_broadcast_sd
#define _SIMD_MADDSUB 1
#define _SIMD_ADDSUB _mm256_addsub_pd
#define _SIMD_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif /* HAVE_AVX2 */
#define _SIMD_FMADDSUB _mm256_FMADDSUB_pd
#define _SIMD_FMSUBADD _mm256_FMSUBADD_pd
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 8
#define __SIMD_DATATYPE __m256
#define _SIMD_LOAD _mm256_load_ps
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm256_store_ps
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm256_mul_ps
#define _SIMD_ADD _mm256_add_ps
#define _SIMD_XOR _mm256_xor_ps
#define _SIMD_BROADCAST _mm256_broadcast_ss
#define _SIMD_MADDSUB 1
#define _SIMD_ADDSUB _mm256_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif
#endif /* HAVE_AVX2 */
#define _SIMD_FMADDSUB _mm256_FMADDSUB_ps
#define _SIMD_FMSUBADD _mm256_FMSUBADD_ps
#endif /* SINGLE_PRECISION_COMPLEX */
#endif /* VEC_SET == AVX_256 */
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE_INTRINSICS #ifdef HAVE_SSE_INTRINSICS
...@@ -164,6 +252,17 @@ ...@@ -164,6 +252,17 @@
#endif #endif
#endif /* VEC_SET == SSE_128 */ #endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 24
#endif
#endif /* VEC_SET == AVX_256 */
//Forward declaration //Forward declaration
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1 #ifdef BLOCK1
...@@ -173,6 +272,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -173,6 +272,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
,int ldh, DATA_TYPE s); ,int ldh, DATA_TYPE s);
#endif #endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 10
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 20
#endif
#endif /* VEC_SET == AVX_256 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128 #if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH #undef ROW_LENGTH
...@@ -183,6 +315,52 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -183,6 +315,52 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#define ROW_LENGTH 8 #define ROW_LENGTH 8
#endif #endif
#endif /* VEC_SET == SSE_128 */ #endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == AVX_256 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == AVX_256 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1 #ifdef BLOCK1
); );
...@@ -201,6 +379,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -201,6 +379,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#define ROW_LENGTH 4 #define ROW_LENGTH 4
#endif #endif
#endif /* VEC_SET == SSE_128 */ #endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == AVX_256 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
);
#endif
#ifdef BLOCK2
,int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 1
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == AVX_256 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1 #ifdef BLOCK1
); );
...@@ -209,6 +430,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -209,6 +430,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
,int ldh, DATA_TYPE s); ,int ldh, DATA_TYPE s);
#endif #endif
/* /*
!f>#ifdef HAVE_SSE_INTRINSICS !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
...@@ -239,6 +461,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -239,6 +461,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif !f>#endif
*/ */
/* /*
!f>#ifdef HAVE_SSE_INTRINSICS !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
...@@ -298,16 +521,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -298,16 +521,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef BLOCK1 #ifdef BLOCK1
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6 #define ROW_LENGTH 6
#define STEP_SIZE 6 #define STEP_SIZE 6
#define UPPER_BOUND 4 #define UPPER_BOUND 5
#endif #endif
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12 #define ROW_LENGTH 12
#define STEP_SIZE 12 #define STEP_SIZE 12
#define UPPER_BOUND 8 #define UPPER_BOUND 10
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#define STEP_SIZE 24
#define UPPER_BOUND 20
#endif #endif
#endif /* VEC_SET == AVX_256 */
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
{ {
...@@ -318,6 +557,96 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -318,6 +557,96 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
return; return;
} }
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 5
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 10
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 10
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 20
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH #undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4 #define ROW_LENGTH 4
...@@ -325,12 +654,26 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -325,12 +654,26 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8 #define ROW_LENGTH 8
#endif #endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 1
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH #undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2 #define ROW_LENGTH 2
...@@ -338,11 +681,13 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -338,11 +681,13 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4 #define ROW_LENGTH 4
#endif #endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif /* BLOCK1 */ #endif /* BLOCK1 */
#ifdef BLOCK2 #ifdef BLOCK2
...@@ -372,11 +717,9 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -372,11 +717,9 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#undef ROW_LENGTH #undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2 #define ROW_LENGTH 2
#define STEP_SIZE 2
#endif #endif
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4 #define ROW_LENGTH 4
#define STEP_SIZE 4
#endif #endif
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
...@@ -384,26 +727,49 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -384,26 +727,49 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif /* BLOCK2 */ #undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef WITH_DEBUG #define ROW_LENGTH 1
if (worked_on != nq) #endif
{ #ifdef SINGLE_PRECISION_COMPLEX
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq); #define ROW_LENGTH 2
abort(); #endif
} if (nq-i == ROW_LENGTH)
#endif {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#endif /* BLOCK2 */
//#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
abort();
}
//#endif
} }
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6 #define ROW_LENGTH 6
#endif #endif
#ifdef SINGLE_PRECISION_COMPLEX #ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12 #define ROW_LENGTH 12
#endif #endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#endif /* VEC_SET == AVX_256 */