Commit 9c352986 authored by Andreas Marek
Browse files

Better debugging of GPU

parent d2828aed
...@@ -51,7 +51,7 @@ ...@@ -51,7 +51,7 @@
&_& &_&
#endif #endif
&PRECISION & &PRECISION &
(obj, a, a_dev, stripe_width, a_dim2, stripe_count, & (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
max_threads, l_nev, & max_threads, l_nev, &
#endif #endif
...@@ -109,6 +109,7 @@ ...@@ -109,6 +109,7 @@
implicit none implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU, wantDebug
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size integer(kind=ik), intent(in) :: nbw, max_blk_size
...@@ -126,7 +127,7 @@ ...@@ -126,7 +127,7 @@
integer(kind=ik), intent(in) :: last_stripe_width integer(kind=ik), intent(in) :: last_stripe_width
#if REALCASE == 1 #if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:) real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
...@@ -137,7 +138,7 @@ ...@@ -137,7 +138,7 @@
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
#if REALCASE == 1 #if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer:: a(:,:,:,:) real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
...@@ -171,18 +172,34 @@ ...@@ -171,18 +172,34 @@
real(kind=c_double) :: ttt ! MPI_WTIME always needs double real(kind=c_double) :: ttt ! MPI_WTIME always needs double
if (wantDebug) then
if (useGPU .and. &
#if REALCASE == 1 #if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available #endif
if (ncols < 1) return #if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
endif endif
endif
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return if (ncols < 1) then
if (wantDebug) then
print *, "Returning early from compute_hh_trafo"
endif
return
endif
endif endif
#endif
call obj%timer%start("compute_hh_trafo_& call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -248,10 +265,14 @@ ...@@ -248,10 +265,14 @@
#if REALCASE == 1 #if REALCASE == 1
! GPU kernel real ! GPU kernel real
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_& dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION& &PRECISION&
&_& &_&
&MATH_DATATYPE &MATH_DATATYPE
call launch_compute_hh_trafo_gpu_kernel_& call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
...@@ -261,6 +282,9 @@ ...@@ -261,6 +282,9 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! GPU kernel complex ! GPU kernel complex
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
endif
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_& dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION& &PRECISION&
...@@ -281,14 +305,20 @@ ...@@ -281,14 +305,20 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, & & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols) hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: GPU")
endif
else ! not CUDA kernel else ! not CUDA kernel
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: CPU")
endif
#if REALCASE == 1 #if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. & if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
...@@ -385,7 +415,8 @@ ...@@ -385,7 +415,8 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_generic_& &_generic_&
&PRECISION& &PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
...@@ -401,7 +432,8 @@ ...@@ -401,7 +432,8 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_generic_& &_generic_&
&PRECISION& &PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif #endif
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
...@@ -489,7 +521,8 @@ ...@@ -489,7 +521,8 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_generic_simple_& &_generic_simple_&
&PRECISION& &PRECISION&
& (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
...@@ -505,7 +538,8 @@ ...@@ -505,7 +538,8 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_generic_simple_& &_generic_simple_&
&PRECISION& &PRECISION&
& (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
nbw, nl, stripe_width)
#endif #endif
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
...@@ -1527,6 +1561,9 @@ ...@@ -1527,6 +1561,9 @@
!no avx512 block6 complex kernel !no avx512 block6 complex kernel
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
if (wantDebug) then
call obj%timer%stop("compute_hh_trafo: CPU")
endif
endif ! GPU_KERNEL endif ! GPU_KERNEL
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
......
...@@ -1706,7 +1706,7 @@ ...@@ -1706,7 +1706,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_openmp_& &_openmp_&
&PRECISION & &PRECISION &
(obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -1723,7 +1723,7 @@ ...@@ -1723,7 +1723,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -1839,7 +1839,7 @@ ...@@ -1839,7 +1839,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_openmp_& &_openmp_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -1888,7 +1888,7 @@ ...@@ -1888,7 +1888,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -1984,7 +1984,7 @@ ...@@ -1984,7 +1984,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_openmp_& &_openmp_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -2002,7 +2002,7 @@ ...@@ -2002,7 +2002,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -2080,7 +2080,7 @@ ...@@ -2080,7 +2080,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_openmp_& &_openmp_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
...@@ -2097,7 +2097,7 @@ ...@@ -2097,7 +2097,7 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
hh_dot_dev, & hh_dot_dev, &
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment