Commit d5798c37 authored by Andreas Marek's avatar Andreas Marek
Browse files

Cleanup real 256bit kernels

parent 08801ac3
......@@ -70,10 +70,11 @@
#define SPARC64_SSE 1281
#define VSX_SSE 1282
#define NEON_ARCH64_128 1285
#define AVX_256 256
#define AVX_512 512
 
 
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == AVX_512
#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512
#include <x86intrin.h>
#endif
 
......@@ -124,7 +125,7 @@
#define SIMD_SET NEON_ARCH64
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2
#endif
 
......@@ -229,7 +230,7 @@
#endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define offset 4
#define __SIMD_DATATYPE __m256d
......@@ -297,7 +298,7 @@
#endif
#endif /* HAVE_AVX2 */
#endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -370,7 +371,7 @@
#undef __AVX__
#endif
 
#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256 || VEC_SET == AVX_512
#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512
#undef _LOAD
#undef _STORE
#undef _XOR
......@@ -401,7 +402,7 @@
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 4
......@@ -410,7 +411,7 @@
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -444,7 +445,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 8
......@@ -453,7 +454,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -487,7 +488,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 12
......@@ -496,7 +497,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 24
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -531,7 +532,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 16
......@@ -540,7 +541,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 32
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -575,7 +576,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 20
......@@ -584,7 +585,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 40
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -619,7 +620,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 24
......@@ -628,7 +629,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 48
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1218,7 +1219,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
 
#endif /* BLOCK6 */
 
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == AVX_512
#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512
#pragma ivdep
#endif
for (i = BLOCK; i < nb; i++)
......@@ -1278,7 +1279,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define STEP_SIZE 24
#define ROW_LENGTH 24
......@@ -1289,7 +1290,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#define ROW_LENGTH 48
#define UPPER_BOUND 40
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1326,14 +1327,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 20
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 40
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1360,14 +1361,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 32
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1395,14 +1396,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 24
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1418,7 +1419,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
worked_on += ROW_LENGTH;
}
 
#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256
#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256
 
#undef ROW_LENGTH
#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
......@@ -1430,14 +1431,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
 
if (nq-i == ROW_LENGTH)
......@@ -1456,14 +1457,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
if (nq-i == ROW_LENGTH)
{
......@@ -1471,7 +1472,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
worked_on += ROW_LENGTH;
}
 
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256 */
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256 */
 
#endif /* BLOCK2 */
 
......@@ -1492,7 +1493,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 12
#define STEP_SIZE 12
......@@ -1503,7 +1504,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#define STEP_SIZE 24
#define UPPER_BOUND 16
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1539,14 +1540,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1574,14 +1575,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1635,7 +1636,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#define STEP_SIZE 8
......@@ -1646,7 +1647,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#define STEP_SIZE 16
#define UPPER_BOUND 8
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1681,14 +1682,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1761,14 +1762,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 24
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 48
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1836,14 +1837,14 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
 
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_REAL
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000);
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_REAL
......@@ -1867,7 +1868,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
__SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]);
#endif
......@@ -1925,7 +1926,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]);
......@@ -2105,7 +2106,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]);
__SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]);
__SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]);
......@@ -2141,7 +2142,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]);
__SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]);
__SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]);
......@@ -2172,7 +2173,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]);
__SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]);
__SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]);
......@@ -2200,7 +2201,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]);
......@@ -2461,10 +2462,10 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]);
h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]);
h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]);
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */
 
q1 = _LOAD(&q[i*ldq]);
q2 = _LOAD(&q[(i*ldq)+offset]);
......@@ -2510,7 +2511,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]);
#endif
 
......@@ -2538,7 +2539,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]);
#endif
 
......@@ -2568,7 +2569,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]);
#endif
 
......@@ -2597,7 +2598,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
 
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]);
#endif
 
......@@ -2625,7 +2626,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]);
#endif
 
......@@ -2661,7 +2662,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]);
#endif
 
......@@ -2689,7 +2690,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]);
#endif
 
......@@ -2718,7 +2719,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h1 = _SIMD_SET(hh[nb-2], hh[nb-2]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-2]);
#endif
 
......@@ -2753,7 +2754,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]);
#endif
 
......@@ -2781,7 +2782,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h1 = _SIMD_SET(hh[nb-1], hh[nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-1]);
#endif
 
......@@ -2821,7 +2822,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]);
#endif
 
......@@ -2849,7 +2850,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]);
#endif
 
......@@ -2875,7 +2876,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h1 = _SIMD_SET(hh[nb-4], hh[nb-4]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]);
#endif
 
......@@ -2908,7 +2909,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h2 = _SIMD_SET(hh[ldh+nb-3], hh[ldh+nb-3]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]);
#endif
 
......@@ -2934,7 +2935,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]);
#endif
 
......@@ -2960,7 +2961,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]);
#endif
 
......@@ -2986,7 +2987,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#if VEC_SET == SPARC64_SSE
h1 = _SIMD_SET(hh[nb-3], hh[nb-3]);
#endif
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-3]);
#endif
 
......@@ -3021,7 +3022,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]);
#endif
 
......@@ -3049,7 +3050,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]);
#endif
 
......@@ -3077,7 +3078,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h1 = _SIMD_SET(hh[nb-2], hh[nb-2]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]);
#endif
q1 = _LOAD(&q[(nb+3)*ldq]);
......@@ -3111,7 +3112,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]);
#endif
 
......@@ -3139,7 +3140,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h1 = _SIMD_SET(hh[nb-1], hh[nb-1]);
#endif
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]);
#endif
 
......@@ -3271,7 +3272,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#endif /* VEC_SET == SPARC64_SSE */
 
#if VEC_SET == 256
#if VEC_SET == AVX_256
__SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh);
__SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]);
#if defined(BLOCK4) || defined(BLOCK6)
......@@ -3312,10 +3313,10 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]);
__SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]);
#endif
#endif /* VEC_SET == 256 */
#endif /* VEC_SET == AVX_256 */