Commit 9c352986 authored by Andreas Marek
Browse files

Better debugging of GPU

parent d2828aed
......@@ -51,7 +51,7 @@
&_&
#endif
&PRECISION &
(obj, a, a_dev, stripe_width, a_dim2, stripe_count, &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP
max_threads, l_nev, &
#endif
......@@ -109,6 +109,7 @@
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU, wantDebug
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
......@@ -126,7 +127,7 @@
integer(kind=ik), intent(in) :: last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
......@@ -137,7 +138,7 @@
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:,:)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
......@@ -171,18 +172,34 @@
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
if (wantDebug) then
if (useGPU .and. &
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
endif
endif
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
if (ncols < 1) then
if (wantDebug) then
print *, "Returning early from compute_hh_trafo"
endif
return
endif
endif
#endif
call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE&
......@@ -248,10 +265,14 @@
#if REALCASE == 1
! GPU kernel real
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE&
&_&
......@@ -261,6 +282,9 @@
#if COMPLEXCASE == 1
! GPU kernel complex
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
......@@ -281,14 +305,20 @@
&MATH_DATATYPE&
&_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
& (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */
if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: GPU")
endif
else ! not CUDA kernel
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: CPU")
endif
#if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
......@@ -385,7 +415,8 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
#endif
#else /* WITH_OPENMP */
......@@ -401,7 +432,8 @@
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -489,7 +521,8 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#else /* WITH_OPENMP */
......@@ -505,7 +538,8 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif
#endif /* WITH_OPENMP */
......@@ -1527,6 +1561,9 @@
!no avx512 block6 complex kernel
#endif /* COMPLEXCASE */
if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: CPU")
endif
endif ! GPU_KERNEL
#ifdef WITH_OPENMP
......
......@@ -1706,7 +1706,7 @@
&MATH_DATATYPE&
&_openmp_&
&PRECISION &
(obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
(obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -1723,7 +1723,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -1839,7 +1839,7 @@
&MATH_DATATYPE&
&_openmp_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -1888,7 +1888,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -1984,7 +1984,7 @@
&MATH_DATATYPE&
&_openmp_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -2002,7 +2002,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -2080,7 +2080,7 @@
&MATH_DATATYPE&
&_openmp_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......@@ -2097,7 +2097,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
& (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment