Commit 0cd6faf8 authored by Andreas Marek
Browse files

Single precision kernels could still be called at the same time

parent c33fd7a0
...@@ -170,6 +170,7 @@ module compute_hh_trafo_complex ...@@ -170,6 +170,7 @@ module compute_hh_trafo_complex
if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe), & if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe), &
bcast_buffer(1,off+1), nbw, nl, stripe_width) bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif #endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
...@@ -180,7 +181,8 @@ module compute_hh_trafo_complex ...@@ -180,7 +181,8 @@ module compute_hh_trafo_complex
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. & if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime()
ttt = mpi_wtime()
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -199,6 +201,7 @@ module compute_hh_trafo_complex ...@@ -199,6 +201,7 @@ module compute_hh_trafo_complex
if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe), & if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe), &
bcast_buffer(1,off+1), nbw, nl, stripe_width) bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif #endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
...@@ -611,6 +614,8 @@ module compute_hh_trafo_complex ...@@ -611,6 +614,8 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 1, -1 do j = ncols, 1, -1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -621,6 +626,9 @@ module compute_hh_trafo_complex ...@@ -621,6 +626,9 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width) bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
...@@ -631,6 +639,7 @@ module compute_hh_trafo_complex ...@@ -631,6 +639,7 @@ module compute_hh_trafo_complex
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime() ttt = mpi_wtime()
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
do j = ncols, 1, -1 do j = ncols, 1, -1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe,my_thread), & call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
...@@ -640,6 +649,8 @@ module compute_hh_trafo_complex ...@@ -640,6 +649,8 @@ module compute_hh_trafo_complex
bcast_buffer(1,j+off),nbw,nl,stripe_width) bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......
...@@ -877,6 +877,8 @@ module compute_hh_trafo_real ...@@ -877,6 +877,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -888,6 +890,8 @@ module compute_hh_trafo_real ...@@ -888,6 +890,8 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -897,6 +901,8 @@ module compute_hh_trafo_real ...@@ -897,6 +901,8 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL) )
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
...@@ -908,6 +914,9 @@ module compute_hh_trafo_real ...@@ -908,6 +914,9 @@ module compute_hh_trafo_real
w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
#endif #endif
enddo enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL) ) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -975,6 +984,9 @@ module compute_hh_trafo_real ...@@ -975,6 +984,9 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) )
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
...@@ -1008,6 +1020,9 @@ module compute_hh_trafo_real ...@@ -1008,6 +1020,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif #endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) ) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
...@@ -1018,6 +1033,9 @@ module compute_hh_trafo_real ...@@ -1018,6 +1033,9 @@ module compute_hh_trafo_real
if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
...@@ -1053,6 +1071,9 @@ module compute_hh_trafo_real ...@@ -1053,6 +1071,9 @@ module compute_hh_trafo_real
if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif #endif
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment