Commit 9c352986 authored by Andreas Marek
Browse files

Better debugging of GPU

parent d2828aed
......@@ -51,7 +51,7 @@
&_&
#endif
&PRECISION &
(obj, a, a_dev, stripe_width, a_dim2, stripe_count, &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP
max_threads, l_nev, &
#endif
......@@ -108,81 +108,98 @@
use elpa_generated_fortran_interfaces
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU, wantDebug
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
#endif
integer(kind=ik), intent(in) :: a_off
integer(kind=ik), intent(in) :: a_off
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
integer(kind=ik), intent(in) :: last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:)
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:)
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:)
#endif
#else /* WITH_OPENMP */
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:,:)
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:)
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:)
#endif
#endif /* WITH_OPENMP */
integer(kind=ik), intent(in) :: kernel
integer(kind=ik), intent(in) :: kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
#if REALCASE == 1
integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
#endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe
integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff
integer(kind=ik) :: my_thread, noff
#endif
integer(kind=ik) :: j, nl, jj, jjj, n_times
integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6)
real(kind=C_DATATYPE_KIND) :: w(nbw,6)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: w(nbw,2)
complex(kind=C_DATATYPE_KIND) :: w(nbw,2)
#endif
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
if (wantDebug) then
if (useGPU .and. &
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
endif
endif
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
if (ncols < 1) then
if (wantDebug) then
print *, "Returning early from compute_hh_trafo"
endif
return
endif
endif
#endif
call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE&
......@@ -211,16 +228,16 @@
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
......@@ -248,47 +265,60 @@
#if REALCASE == 1
! GPU kernel real
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
&MATH_DATATYPE&
&_&
&PRECISION&
& (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
#endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
&_&
&MATH_DATATYPE
dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
&_&
&MATH_DATATYPE
dev_offset_2 =( off-1 )* size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
&MATH_DATATYPE&
&_&
&PRECISION&
& (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */
if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: GPU")
endif
else ! not CUDA kernel
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: CPU")
endif
#if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
......@@ -321,17 +351,17 @@
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
#endif
......@@ -339,17 +369,17 @@
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */
......@@ -376,32 +406,34 @@
#ifdef USE_ASSUMED_SIZE
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -428,16 +460,16 @@
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#endif
......@@ -445,16 +477,16 @@
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
#endif
......@@ -480,32 +512,34 @@
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -530,17 +564,17 @@
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifndef WITH_FIXED_REAL_KERNEL
......@@ -561,18 +595,18 @@
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_&
&PRECISION&
&_sse_assembly&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#ifndef WITH_FIXED_COMPLEX_KERNEL
......@@ -598,16 +632,16 @@
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_sse_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_sse_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sse_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_sse_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
......@@ -637,16 +671,16 @@
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_avx_avx2_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_avx_avx2_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_avx_avx2_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_avx_avx2_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
......@@ -675,16 +709,16 @@
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_avx512_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_avx512_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_avx512_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
&MATH_DATATYPE&
&_avx512_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
......@@ -710,16 +744,16 @@
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_sse_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
&MATH_DATATYPE&
&_sse_2hv_&