Commit fc609543 authored by Andreas Marek
Browse files

Use all rows in new kernel

parent 38c1d3c3
...@@ -1177,35 +1177,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -1177,35 +1177,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512 //#if VEC_SET != AVX_512
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE) for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on +=ROW_LENGTH; worked_on +=ROW_LENGTH;
} }
#endif //#endif
#if VEC_SET == AVX_512 //#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX //#ifdef DOUBLE_PRECISION_COMPLEX
for (i = 0; i < nq - 12; i+=16) // for (i = 0; i < nq - 12; i+=16)
{ // {
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+12], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+12], hh, nb, ldq, ldh, s);
worked_on +=16; // worked_on +=16;
} // }
#endif //#endif
#ifdef SINGLE_PRECISION_COMPLEX //#ifdef SINGLE_PRECISION_COMPLEX
for (i = 0; i < nq - 24; i+=32) // for (i = 0; i < nq - 24; i+=32)
{ // {
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+24], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+24], hh, nb, ldq, ldh, s);
worked_on +=32; // worked_on +=32;
} // }
#endif //#endif
#endif //#endif
if (nq == i) if (nq == i)
{ {
...@@ -1242,34 +1242,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -1242,34 +1242,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif #endif
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512 //#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif //#endif
#if VEC_SET == AVX_512 //#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX //#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 12) // if (nq-i == 12)
{ // {
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s);
worked_on +=12; // worked_on +=12;
} // }
#endif //#endif
#ifdef SINGLE_PRECISION_COMPLEX //#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 24) // if (nq-i == 24)
{ // {
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s);
worked_on +=24; // worked_on +=24;
} // }
#endif //#endif
//
#endif //#endif
#if VEC_SET == SSE_128 #if VEC_SET == SSE_128
#undef ROW_LENGTH #undef ROW_LENGTH
...@@ -1301,32 +1301,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -1301,32 +1301,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif #endif
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512 //#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif //#endif
#if VEC_SET == AVX_512 //#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX //#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 8) // if (nq-i == 8)
{ // {
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
worked_on +=8; // worked_on +=8;
} // }
#endif //#endif
#ifdef SINGLE_PRECISION_COMPLEX //#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 16) // if (nq-i == 16)
{ // {
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
worked_on +=16; // worked_on +=16;
} // }
#endif //#endif
//
#endif //#endif
#if VEC_SET == SSE_128 #if VEC_SET == SSE_128
#undef ROW_LENGTH #undef ROW_LENGTH
...@@ -1358,40 +1358,40 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D ...@@ -1358,40 +1358,40 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif #endif
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512 //#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH) if (nq-i == ROW_LENGTH)
{ {
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif //#endif
#if VEC_SET == AVX_512 //#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX //#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 4) // if (nq-i == 4)
{ // {
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
worked_on +=4; // worked_on +=4;
} // }
#endif //#endif
#ifdef SINGLE_PRECISION_COMPLEX //#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 8) // if (nq-i == 8)
{ // {
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s); // hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
worked_on +=8; // worked_on +=8;
} // }
#endif //#endif
//
#endif //#endif
#endif /* BLOCK2 */ #endif /* BLOCK2 */
//#ifdef WITH_DEBUG #ifdef WITH_DEBUG
if (worked_on != nq) if (worked_on != nq)
{ {
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq); printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
abort(); abort();
} }
//#endif #endif
} }
...@@ -5147,13 +5147,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -5147,13 +5147,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == SSE_128 */ #endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256 #if VEC_SET == AVX_256
h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */ #endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512 #if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__ #ifndef __ELPA_USE_FMA__
...@@ -5561,13 +5561,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -5561,13 +5561,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == SSE_128 */ #endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256 #if VEC_SET == AVX_256
h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */ #endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512 #if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
...@@ -5607,8 +5607,8 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -5607,8 +5607,8 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == AVX_256 */ #endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512 #if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
tmp1 = _SIMD_MUL(h2_imag, y1); tmp1 = _SIMD_MUL(h2_imag, y1);
...@@ -5820,8 +5820,8 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM ...@@ -5820,8 +5820,8 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == AVX_256 */ #endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512 #if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */ #endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__ #ifndef __ELPA_USE_FMA__
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment