Commit bf7c6410 authored by Sebastian Ohlmann

Add support for NVTX profiling

When profiling the GPU version, NVTX can be used to highlight the
corresponding regions of the code in the timeline of the profiling tool
(nvvp or nsight systems). This is very useful to correlate what happens
on the GPU with what part of the code we are in.
parent ee0e63d0
......@@ -1251,6 +1251,29 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
dnl Optional NVTX support for profiling the GPU version.
dnl --enable-nvtx defaults to no; any value other than yes is treated as no.
dnl When enabled, WITH_NVTX is defined and configure aborts unless
dnl nvtxRangePop can be linked, either directly or via -lnvToolsExt.
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
  AS_HELP_STRING([--enable-nvtx],
    [build and install nvtx wrapper for profiling the GPU version, default no.]),
  [
    if test x"$enableval" = x"yes"; then
      enable_nvtx=yes
    else
      enable_nvtx=no
    fi
  ],
  [enable_nvtx=no])
AC_MSG_RESULT([${enable_nvtx}])
if test x"${enable_nvtx}" = x"yes"; then
  AC_DEFINE([WITH_NVTX],[1],[enable NVTX support])
  dnl The link test is done with the C compiler because nvtxRangePop is a C symbol.
  AC_LANG_PUSH([C])
  AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no])
  if test x"${have_nvtoolsext}" = x"no"; then
    AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable NVTX support (--disable-nvtx)])
  fi
  AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
......
......@@ -465,8 +465,41 @@ module cuda_functions
end interface
#ifdef WITH_NVTX
! NVTX profiling interfaces
! Explicit interface to the C routine nvtxRangePushA.
!   name : character array passed to C by reference; the wrapper nvtxRangePush
!          in this module passes a NUL-terminated copy of the range label.
! The argument is only read by the callee, hence intent(in).
! NOTE(review): the NVTX C API declares nvtxRangePushA as returning an int;
! binding it as a subroutine discards that value - confirm this is acceptable
! on all target platforms.
interface nvtxRangePushA
  subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
    use, intrinsic :: iso_c_binding, only : c_char
    implicit none
    character(kind=c_char, len=1), intent(in) :: name(*)
  end subroutine nvtxRangePushA
end interface nvtxRangePushA
! Explicit interface to the argument-less C routine nvtxRangePop.
! In this code base it is called after a matching nvtxRangePush to end
! the profiling range that call started.
interface nvtxRangePop
  subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
    implicit none
  end subroutine nvtxRangePop
end interface nvtxRangePop
#endif
contains
#ifdef WITH_NVTX
! this wrapper is needed for the string conversion
! Start an NVTX range labelled with a Fortran string.
!   range_name : label of the range; all len(range_name) characters are
!                used, including any trailing blanks.
! The label is copied character by character into a C character array with
! a terminating NUL appended, and that array is handed to nvtxRangePushA.
subroutine nvtxRangePush(range_name)
  implicit none
  character(len=*), intent(in) :: range_name
  ! one element per label character plus one for the terminator
  character(kind=C_CHAR,len=1) :: c_label(len(range_name)+1)
  integer :: n_chars, pos

  n_chars = len(range_name)
  ! terminator goes into the last slot
  c_label(n_chars+1) = char(0)
  do pos = 1, n_chars
    c_label(pos) = range_name(pos:pos)
  end do
  call nvtxRangePushA(c_label)
end subroutine nvtxRangePush
#endif
! functions to set and query the CUDA devices
function cublas_create(handle) result(success)
......
......@@ -131,6 +131,9 @@ function elpa_solve_evp_&
#else
nrThreads = 1
#endif
#ifdef WITH_NVTX
call nvtxRangePush("elpa1")
#endif
success = .true.
......@@ -327,6 +330,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("tridi")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("tridi")
#endif
call tridiag_&
&MATH_DATATYPE&
......@@ -334,6 +340,9 @@ function elpa_solve_evp_&
&PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("tridi")
#endif
......@@ -345,6 +354,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("solve")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("solve")
#endif
call solve_tridi_&
&PRECISION&
......@@ -357,6 +369,9 @@ function elpa_solve_evp_&
#endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("solve")
#endif
......@@ -398,6 +413,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("trans_ev")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("trans_ev")
#endif
call trans_ev_&
&MATH_DATATYPE&
......@@ -405,6 +423,9 @@ function elpa_solve_evp_&
&PRECISION&
& (obj, na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("trans_ev")
#endif
......@@ -445,6 +466,9 @@ function elpa_solve_evp_&
endif
endif
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
! restore original OpenMP settings
#ifdef WITH_OPENMP
! store the number of OpenMP threads used in the calling function
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment