Commit f7c5807e authored by Pavel Kus's avatar Pavel Kus
Browse files

detailed timing fixes, dummy timer introduced (to avoid ifdefs)

Conflicts:
	src/elpa1_solve_tridi_real_template.X90
parent 6dac0853
...@@ -75,6 +75,9 @@ if HAVE_DETAILED_TIMINGS ...@@ -75,6 +75,9 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/time.c \ src/ftimings/time.c \
src/ftimings/virtual_memory.c \ src/ftimings/virtual_memory.c \
src/ftimings/papi.c src/ftimings/papi.c
else
libelpa@SUFFIX@_private_la_SOURCES += \
src/timer_dummy.F90
endif endif
if WITH_GPU_VERSION if WITH_GPU_VERSION
......
...@@ -187,7 +187,7 @@ AC_MSG_CHECKING(whether ELPA should be build with more detailed timing support) ...@@ -187,7 +187,7 @@ AC_MSG_CHECKING(whether ELPA should be build with more detailed timing support)
AC_ARG_ENABLE([timings], AC_ARG_ENABLE([timings],
AS_HELP_STRING([--enable-timing], AS_HELP_STRING([--enable-timing],
[more detailed timing, default no.]), [more detailed timing, default no.]),
[], [enable_timings=yes],
[enable_timings=no]) [enable_timings=no])
AC_MSG_RESULT([${enable_timings}]) AC_MSG_RESULT([${enable_timings}])
......
...@@ -117,7 +117,7 @@ ...@@ -117,7 +117,7 @@
#endif #endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("merge_systems" + M_PRECISSION_SUFFIX) call timer%start("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
success = .true. success = .true.
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -135,7 +135,7 @@ ...@@ -135,7 +135,7 @@
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
...@@ -155,14 +155,14 @@ ...@@ -155,14 +155,14 @@
call M_check_monotony_PRECISSION(nm,d,'Input1',wantDebug, success) call M_check_monotony_PRECISSION(nm,d,'Input1',wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
call M_check_monotony_PRECISSION(na-nm,d(nm+1),'Input2',wantDebug, success) call M_check_monotony_PRECISSION(na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
...@@ -249,7 +249,7 @@ ...@@ -249,7 +249,7 @@
call M_resort_ev_PRECISSION(idx, na) call M_resort_ev_PRECISSION(idx, na)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
...@@ -363,14 +363,14 @@ ...@@ -363,14 +363,14 @@
call M_check_monotony_PRECISSION(na1,d1,'Sorted1', wantDebug, success) call M_check_monotony_PRECISSION(na1,d1,'Sorted1', wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
call M_check_monotony_PRECISSION(na2,d2,'Sorted2', wantDebug, success) call M_check_monotony_PRECISSION(na2,d2,'Sorted2', wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
...@@ -424,7 +424,7 @@ ...@@ -424,7 +424,7 @@
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) call timer%start("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif #endif
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) !$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
...@@ -472,7 +472,7 @@ ...@@ -472,7 +472,7 @@
!$OMP END PARALLEL !$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) call timer%stop("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif #endif
do i = 0, max_threads-1 do i = 0, max_threads-1
...@@ -493,7 +493,7 @@ ...@@ -493,7 +493,7 @@
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) call timer%start("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif #endif
!$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, & !$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, &
...@@ -516,7 +516,7 @@ ...@@ -516,7 +516,7 @@
!$OMP END PARALLEL DO !$OMP END PARALLEL DO
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) call timer%stop("OpenMP parallel" // M_PRECISSION_SUFFIX)
#endif #endif
#endif #endif
...@@ -537,7 +537,7 @@ ...@@ -537,7 +537,7 @@
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
return return
endif endif
...@@ -770,7 +770,7 @@ ...@@ -770,7 +770,7 @@
#endif #endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("merge_systems" + M_PRECISSION_SUFFIX) call timer%stop("merge_systems" // M_PRECISSION_SUFFIX)
#endif #endif
......
...@@ -57,6 +57,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -57,6 +57,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else
use timings_dummy
#endif #endif
use precision use precision
implicit none implicit none
...@@ -79,19 +81,15 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -79,19 +81,15 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS call timer%start("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication") call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
#endif
success = .true. success = .true.
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
...@@ -118,9 +116,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -118,9 +116,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Scalapack supports it but delivers no results for these columns, ! Scalapack supports it but delivers no results for these columns,
! which is rather annoying ! which is rather annoying
if (nc==0) then if (nc==0) then
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
success = .false. success = .false.
return return
...@@ -148,9 +144,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -148,9 +144,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
call M_solve_tridi_col_PRECISSION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & call M_solve_tridi_col_PRECISSION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, wantDebug, success) matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
return return
endif endif
! If there is only 1 processor column, we are done ! If there is only 1 processor column, we are done
...@@ -162,9 +156,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -162,9 +156,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop stop
endif endif
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
return return
endif endif
...@@ -225,9 +217,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -225,9 +217,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Recursively merge sub problems ! Recursively merge sub problems
call M_merge_recursive_PRECISSION(0, np_cols, wantDebug, success) call M_merge_recursive_PRECISSION(0, np_cols, wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
return return
endif endif
...@@ -237,9 +227,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -237,9 +227,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop stop
endif endif
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" + M_PRECISSION_SUFFIX)
#endif
return return
contains contains
...@@ -364,6 +352,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -364,6 +352,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Works best if the number of processor rows is a power of 2! ! Works best if the number of processor rows is a power of 2!
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else
use timings_dummy
#endif #endif
use precision use precision
implicit none implicit none
...@@ -389,9 +379,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -389,9 +379,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS call timer%start("solve_tridi_col" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi_col" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication") call timer%start("mpi_communication")
#endif #endif
...@@ -576,9 +564,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -576,9 +564,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop stop
endif endif
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi_col" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi_col" + M_PRECISSION_SUFFIX)
#endif
end subroutine M_solve_tridi_col_PRECISSION end subroutine M_solve_tridi_col_PRECISSION
...@@ -588,6 +574,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -588,6 +574,8 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly. ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else
use timings_dummy
#endif #endif
use precision use precision
implicit none implicit none
...@@ -606,9 +594,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -606,9 +594,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS call timer%start("solve_tridi_single" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi_single" + M_PRECISSION_SUFFIX)
#endif
success = .true. success = .true.
allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage) allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
...@@ -702,9 +688,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp ...@@ -702,9 +688,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
endif endif
enddo enddo
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi_single" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi_single" + M_PRECISSION_SUFFIX)
#endif
end subroutine M_solve_tridi_single_problem_PRECISSION end subroutine M_solve_tridi_single_problem_PRECISSION
...@@ -156,7 +156,7 @@ ...@@ -156,7 +156,7 @@
! Upper and lower bound of the shifted solution interval are a and b ! Upper and lower bound of the shifted solution interval are a and b
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_secular_equation" + M_PRECISSION_SUFFIX) call timer%start("solve_secular_equation" // M_PRECISSION_SUFFIX)
#endif #endif
if (i==n) then if (i==n) then
...@@ -222,7 +222,7 @@ ...@@ -222,7 +222,7 @@
dlam = x + dshift dlam = x + dshift
delta(:) = delta(:) - x delta(:) = delta(:) - x
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_secular_equation" + M_PRECISSION_SUFFIX) call timer%stop("solve_secular_equation" // M_PRECISSION_SUFFIX)
#endif #endif
end subroutine M_solve_secular_equation_PRECISSION end subroutine M_solve_secular_equation_PRECISSION
......
...@@ -111,7 +111,7 @@ ...@@ -111,7 +111,7 @@
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_real" + M_PRECISSION_SUFFIX) call timer%start("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif #endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication") call timer%start("mpi_communication")
...@@ -249,7 +249,9 @@ ...@@ -249,7 +249,9 @@
tmat = 0 tmat = 0
if (l_rows>0) & if (l_rows>0) &
call M_PRECISSION_SYRK('U', 'T', nstor, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), M_CONST_0_0, tmat, max_stored_rows) call M_PRECISSION_SYRK('U', 'T', nstor, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
M_CONST_0_0, tmat, max_stored_rows)
nc = 0 nc = 0
do n=1,nstor-1 do n=1,nstor-1
...@@ -272,7 +274,9 @@ ...@@ -272,7 +274,9 @@
nc = 0 nc = 0
tmat(1,1) = tau(ice-nstor+1) tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1 do n=1,nstor-1
call M_PRECISSION_TRMV('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) call M_PRECISSION_TRMV('L', 'T', 'N', n, &
tmat, max_stored_rows, &
h2(nc+1), 1)
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n nc = nc+n
...@@ -288,8 +292,10 @@ ...@@ -288,8 +292,10 @@
! Q = Q - V * T * V**T * Q ! Q = Q - V * T * V**T * Q
if (l_rows>0) then if (l_rows>0) then
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), & call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, &
q, ldq, M_CONST_0_0, tmp1, nstor) M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, &
M_CONST_0_0, tmp1, nstor)
else else
! if (useGPU) then ! if (useGPU) then
...@@ -316,13 +322,21 @@ ...@@ -316,13 +322,21 @@
if (l_rows>0) then if (l_rows>0) then
#ifdef WITH_MPI #ifdef WITH_MPI
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, M_CONST_1_0, tmat, max_stored_rows, tmp2, nstor) call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, -M_CONST_1_0, hvm, ubound(hvm,dim=1), & M_CONST_1_0, tmat, max_stored_rows, &
tmp2, nstor, M_CONST_1_0, q, ldq) tmp2, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
M_CONST_1_0, q, ldq)
#else #else
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, M_CONST_1_0, tmat, max_stored_rows, tmp1, nstor) call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, -M_CONST_1_0, hvm, ubound(hvm,dim=1), & M_CONST_1_0, tmat, max_stored_rows, &
tmp1, nstor, M_CONST_1_0, q, ldq) tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
M_CONST_1_0, q, ldq)
#endif #endif
endif endif
nstor = 0 nstor = 0
...@@ -342,7 +356,7 @@ ...@@ -342,7 +356,7 @@
! endif ! endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_real" + M_PRECISSION_SUFFIX) call timer%stop("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif #endif
end subroutine M_trans_ev_real_PRECISSION end subroutine M_trans_ev_real_PRECISSION
...@@ -86,6 +86,8 @@ ...@@ -86,6 +86,8 @@
use iso_c_binding use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else
use timings_dummy
#endif #endif
use precision use precision
implicit none implicit none
...@@ -155,9 +157,7 @@ ...@@ -155,9 +157,7 @@
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS call timer%start("tridiag_real" // M_PRECISSION_SUFFIX)
call timer%start("tridiag_real" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication") call timer%start("mpi_communication")
...@@ -448,9 +448,7 @@ ...@@ -448,9 +448,7 @@
! else !do not use GPU ! else !do not use GPU
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel") call timer%start("OpenMP parallel")
#endif
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
my_thread = omp_get_thread_num() my_thread = omp_get_thread_num()
...@@ -531,9 +529,7 @@ ...@@ -531,9 +529,7 @@
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
!$OMP END PARALLEL !$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel") call timer%stop("OpenMP parallel")
#endif
do i=0,max_threads-1 do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i) u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i) u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
...@@ -708,6 +704,31 @@ ...@@ -708,6 +704,31 @@
print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
stop stop
endif endif
if (useGPU) then
! todo: should we leave a on the device for further use?
successCUDA = cuda_free(a_dev)
call check_dealloc_CUDA("tridiag_real", "a_dev", successCUDA)
successCUDA = cuda_free(v_row_dev)
call check_dealloc_CUDA("tridiag_real", "v_row_dev", successCUDA)
successCUDA = cuda_free(u_row_dev)
call check_dealloc_CUDA("tridiag_real", "u_row_dev", successCUDA)
successCUDA = cuda_free(v_col_dev)
call check_dealloc_CUDA("tridiag_real", "v_col_dev", successCUDA)
successCUDA = cuda_free(u_col_dev)
call check_dealloc_CUDA("tridiag_real", "u_col_dev", successCUDA)
successCUDA = cuda_free(vu_stored_rows_dev)
call check_dealloc_CUDA("tridiag_real", "vu_stored_rows_dev", successCUDA)
successCUDA = cuda_free(uv_stored_cols_dev)
call check_dealloc_CUDA("tridiag_real", "uv_stored_cols_dev", successCUDA)
endif
! todo dealocate at the GPU ! todo dealocate at the GPU
...@@ -740,9 +761,7 @@ ...@@ -740,9 +761,7 @@
stop stop
endif endif
#ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_real" // M_PRECISSION_SUFFIX)
call timer%stop("tridiag_real" + M_PRECISSION_SUFFIX)
#endif