Commit f7c5807e authored by Pavel Kus's avatar Pavel Kus

detailed timing fixes, dummy timer introduced (to avoid ifdefs)

Conflicts:
	src/elpa1_solve_tridi_real_template.X90
parent 6dac0853
......@@ -75,6 +75,9 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/time.c \
src/ftimings/virtual_memory.c \
src/ftimings/papi.c
else
libelpa@SUFFIX@_private_la_SOURCES += \
src/timer_dummy.F90
endif
if WITH_GPU_VERSION
......
......@@ -187,7 +187,7 @@ AC_MSG_CHECKING(whether ELPA should be build with more detailed timing support)
AC_ARG_ENABLE([timings],
AS_HELP_STRING([--enable-timing],
[more detailed timing, default no.]),
[],
[enable_timings=yes],
[enable_timings=no])
AC_MSG_RESULT([${enable_timings}])
......
......@@ -117,7 +117,7 @@
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("merge_systems" + M_PRECISSION_SUFFIX)
call timer%start("merge_systems" // M_PRECISSION_SUFFIX)
#endif
success = .true.
#ifdef HAVE_DETAILED_TIMINGS
......@@ -135,7 +135,7 @@
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
......@@ -155,14 +155,14 @@
call M_check_monotony_PRECISSION(nm,d,'Input1',wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
call M_check_monotony_PRECISSION(na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
......@@ -249,7 +249,7 @@
call M_resort_ev_PRECISSION(idx, na)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
......@@ -363,14 +363,14 @@
call M_check_monotony_PRECISSION(na1,d1,'Sorted1', wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
call M_check_monotony_PRECISSION(na2,d2,'Sorted2', wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
......@@ -424,7 +424,7 @@
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
call timer%start("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
......@@ -472,7 +472,7 @@
!$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
call timer%stop("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif
do i = 0, max_threads-1
......@@ -493,7 +493,7 @@
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
call timer%start("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif
!$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, &
......@@ -516,7 +516,7 @@
!$OMP END PARALLEL DO
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
call timer%stop("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif
#endif
......@@ -537,7 +537,7 @@
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
return
endif
......@@ -770,7 +770,7 @@
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX)
call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif
......
......@@ -57,6 +57,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -79,19 +81,15 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
......@@ -118,9 +116,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Scalapack supports it but delivers no results for these columns,
! which is rather annoying
if (nc==0) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
success = .false.
return
......@@ -148,9 +144,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
call M_solve_tridi_col_PRECISSION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
return
endif
! If there is only 1 processor column, we are done
......@@ -162,9 +156,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
return
endif
......@@ -225,9 +217,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Recursively merge sub problems
call M_merge_recursive_PRECISSION(0, np_cols, wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
return
endif
......@@ -237,9 +227,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
return
contains
......@@ -364,6 +352,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Works best if the number of processor rows is a power of 2!
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -389,9 +379,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi_col" + M_PRECISSION_SUFFIX)
#endif
call timer%start("solve_tridi_col" // M_PRECISSION_SUFFIX)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
......@@ -576,9 +564,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi_col" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi_col" // M_PRECISSION_SUFFIX)
end subroutine M_solve_tridi_col_PRECISSION
......@@ -588,6 +574,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -606,9 +594,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi_single" + M_PRECISSION_SUFFIX)
#endif
call timer%start("solve_tridi_single" // M_PRECISSION_SUFFIX)
success = .true.
allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
......@@ -702,9 +688,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
endif
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi_single" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("solve_tridi_single" // M_PRECISSION_SUFFIX)
end subroutine M_solve_tridi_single_problem_PRECISSION
......@@ -156,7 +156,7 @@
! Upper and lower bound of the shifted solution interval are a and b
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_secular_equation" + M_PRECISSION_SUFFIX)
call timer%start("solve_secular_equation" // M_PRECISSION_SUFFIX)
#endif
if (i==n) then
......@@ -222,7 +222,7 @@
dlam = x + dshift
delta(:) = delta(:) - x
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_secular_equation" + M_PRECISSION_SUFFIX)
call timer%stop("solve_secular_equation" // M_PRECISSION_SUFFIX)
#endif
end subroutine M_solve_secular_equation_PRECISSION
......
......@@ -111,7 +111,7 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_real" + M_PRECISSION_SUFFIX)
call timer%start("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
......@@ -249,7 +249,9 @@
tmat = 0
if (l_rows>0) &
call M_PRECISSION_SYRK('U', 'T', nstor, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), M_CONST_0_0, tmat, max_stored_rows)
call M_PRECISSION_SYRK('U', 'T', nstor, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
M_CONST_0_0, tmat, max_stored_rows)
nc = 0
do n=1,nstor-1
......@@ -272,7 +274,9 @@
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1
call M_PRECISSION_TRMV('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
call M_PRECISSION_TRMV('L', 'T', 'N', n, &
tmat, max_stored_rows, &
h2(nc+1), 1)
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
......@@ -288,8 +292,10 @@
! Q = Q - V * T * V**T * Q
if (l_rows>0) then
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, M_CONST_0_0, tmp1, nstor)
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, &
M_CONST_0_0, tmp1, nstor)
else
! if (useGPU) then
......@@ -316,13 +322,21 @@
if (l_rows>0) then
#ifdef WITH_MPI
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, M_CONST_1_0, tmat, max_stored_rows, tmp2, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, -M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, M_CONST_1_0, q, ldq)
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp2, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
M_CONST_1_0, q, ldq)
#else
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, M_CONST_1_0, tmat, max_stored_rows, tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, -M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, M_CONST_1_0, q, ldq)
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
M_CONST_1_0, q, ldq)
#endif
endif
nstor = 0
......@@ -342,7 +356,7 @@
! endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_real" + M_PRECISSION_SUFFIX)
call timer%stop("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif
end subroutine M_trans_ev_real_PRECISSION
......@@ -86,6 +86,8 @@
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -155,9 +157,7 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("tridiag_real" + M_PRECISSION_SUFFIX)
#endif
call timer%start("tridiag_real" // M_PRECISSION_SUFFIX)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
......@@ -448,9 +448,7 @@
! else !do not use GPU
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
my_thread = omp_get_thread_num()
......@@ -531,9 +529,7 @@
#ifdef WITH_OPENMP
!$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
......@@ -708,6 +704,31 @@
print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
stop
endif
if (useGPU) then
! todo: should we leave a on the device for further use?
successCUDA = cuda_free(a_dev)
call check_dealloc_CUDA("tridiag_real", "a_dev", successCUDA)
successCUDA = cuda_free(v_row_dev)
call check_dealloc_CUDA("tridiag_real", "v_row_dev", successCUDA)
successCUDA = cuda_free(u_row_dev)
call check_dealloc_CUDA("tridiag_real", "u_row_dev", successCUDA)
successCUDA = cuda_free(v_col_dev)
call check_dealloc_CUDA("tridiag_real", "v_col_dev", successCUDA)
successCUDA = cuda_free(u_col_dev)
call check_dealloc_CUDA("tridiag_real", "u_col_dev", successCUDA)
successCUDA = cuda_free(vu_stored_rows_dev)
call check_dealloc_CUDA("tridiag_real", "vu_stored_rows_dev", successCUDA)
successCUDA = cuda_free(uv_stored_cols_dev)
call check_dealloc_CUDA("tridiag_real", "uv_stored_cols_dev", successCUDA)
endif
! todo dealocate at the GPU
......@@ -740,9 +761,7 @@
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_real" + M_PRECISSION_SUFFIX)
#endif
call timer%stop("tridiag_real" // M_PRECISSION_SUFFIX)
contains
......
This diff is collapsed.
......@@ -59,7 +59,7 @@ module ELPA_utilities
private ! By default, all routines contained are private
public :: debug_messages_via_environment_variable, error_unit
public :: check_alloc, check_alloc_CUDA, check_memcpy_CUDA
public :: check_alloc, check_alloc_CUDA, check_memcpy_CUDA, check_dealloc_CUDA
public :: map_global_array_index_to_local_index
public :: pcol, prow
public :: local_index ! Get local index of a block cyclic distributed matrix
......@@ -272,6 +272,21 @@ module ELPA_utilities
endif
end subroutine
subroutine check_dealloc_CUDA(function_name, variable_name, successCUDA)
  ! Abort with a diagnostic if a cuda_free call failed.
  ! Companion to check_alloc_CUDA / check_memcpy_CUDA: callers pass the
  ! status flag returned by cuda_free so GPU deallocation errors are not
  ! silently ignored.
  use precision
  implicit none
  character(len=*), intent(in) :: function_name  ! name of the calling routine, for the message
  character(len=*), intent(in) :: variable_name  ! name of the device buffer being freed
  logical, intent(in)          :: successCUDA    ! .false. means cuda_free reported an error

  if (.not.(successCUDA)) then
    print *, function_name, ": error in cuda_free when deallocating ", variable_name
    stop
  endif
end subroutine
subroutine check_memcpy_CUDA(file_name, line, successCUDA)
use precision
......
#include "config-f90.h"
! Dummy stand-in for the real "timings" module, used when ELPA is built
! without detailed-timing support (HAVE_DETAILED_TIMINGS undefined).
! It exposes a module-level "timer" object whose start/stop methods are
! no-ops, so calling code can invoke timer%start(...) / timer%stop(...)
! unconditionally instead of wrapping every call in #ifdef blocks.
module timings_dummy
implicit none
! No-op timer type; the method signatures are intended to mirror the
! real timer's start/stop interface (name string, optional arguments).
type, public :: timer_dummy_t
contains
procedure, pass :: start => timer_start
procedure, pass :: stop => timer_stop
end type
! Shared timer instance, same name as the one exported by the real
! timings module, so "use timings_dummy" is a drop-in replacement.
type(timer_dummy_t) :: timer
contains
! Intentionally empty: accepts and discards the section name.
subroutine timer_start(self, name, replace)
class(timer_dummy_t), intent(inout), target :: self
character(len=*), intent(in) :: name
logical, intent(in), optional :: replace
end subroutine
! Intentionally empty: the name is optional and ignored.
subroutine timer_stop(self, name)
class(timer_dummy_t), intent(inout), target :: self
character(len=*), intent(in), optional :: name
end subroutine
end module timings_dummy
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment