Commit 47da2281 authored by Andreas Marek's avatar Andreas Marek
Browse files

Single precision SSE BLOCK2 kernel for real case

parent 6b2fd8da
...@@ -304,7 +304,8 @@ contains ...@@ -304,7 +304,8 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
! (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. &
...@@ -659,7 +660,8 @@ contains ...@@ -659,7 +660,8 @@ contains
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
! (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. &
......
...@@ -354,7 +354,6 @@ module compute_hh_trafo_real ...@@ -354,7 +354,6 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGP_KERNEL */ #endif /* WITH_REAL_BGP_KERNEL */
#if defined(WITH_REAL_BGQ_KERNEL) #if defined(WITH_REAL_BGQ_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
...@@ -396,6 +395,48 @@ module compute_hh_trafo_real ...@@ -396,6 +395,48 @@ module compute_hh_trafo_real
endif ! endif !
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_4hv_double(a(1,j+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_4hv_double(a(1,j+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_2hv_double(a(1,jj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_2hv_double(a(1,jj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) #if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
...@@ -437,6 +478,62 @@ module compute_hh_trafo_real ...@@ -437,6 +478,62 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */ #endif /* WITH_REAL_AVX_BLOCK4_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK6) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_real_sse_6hv_double(a(1,j+off+a_off-5,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_real_sse_6hv_double(a(1,j+off+a_off-5,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_4hv_double(a(1,jj+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_4hv_double(a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_2hv_double(a(1,jjj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_2hv_double(a(1,jjj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) #if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
...@@ -492,7 +589,7 @@ module compute_hh_trafo_real ...@@ -492,7 +589,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */ #endif /* WITH_REAL_AVX_BLOCK6_KERNEL */
endif ! GPU_KERNEL endif ! GPU_KERNEL
...@@ -746,8 +843,6 @@ module compute_hh_trafo_real ...@@ -746,8 +843,6 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */ #endif /* WITH_REAL_SSE_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) #if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
...@@ -858,6 +953,7 @@ module compute_hh_trafo_real ...@@ -858,6 +953,7 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
print *,"calling 4"
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2) w(:,3) = bcast_buffer(1:nbw,j+off-2)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment