Commit 5cf0ae52 authored by Andreas Marek

Dummy timer in elpa2

parent 65d959e7
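This commit replaces the per-call #ifdef HAVE_DETAILED_TIMINGS guards around timer%start/timer%stop with an unconditional timer interface: when detailed timings are disabled, the code now uses a timings_dummy module instead of timings, so the timer calls compile as no-ops and the guards can be dropped. A minimal sketch of what such a dummy module might look like (an illustration only; the actual timings_dummy module shipped with ELPA may differ in its internals):

module timings_dummy
  ! Sketch of the dummy-timer idea assumed by this commit, not the real ELPA module:
  ! it exposes a timer object with the same type-bound start/stop procedures as the
  ! detailed timer, but both do nothing.
  implicit none
  private
  public :: timer

  type timer_dummy_t
  contains
    procedure :: start => timer_dummy_start
    procedure :: stop  => timer_dummy_stop
  end type timer_dummy_t

  type(timer_dummy_t) :: timer

contains

  subroutine timer_dummy_start(self, name)
    class(timer_dummy_t), intent(inout) :: self
    character(len=*), intent(in)        :: name
    ! no-op: detailed timings are disabled in this build
  end subroutine timer_dummy_start

  subroutine timer_dummy_stop(self, name)
    class(timer_dummy_t), intent(inout) :: self
    character(len=*), intent(in)        :: name
    ! no-op: detailed timings are disabled in this build
  end subroutine timer_dummy_stop

end module timings_dummy

With such a module in place, a call like call timer%start("mpi_communication") compiles to a no-op whenever HAVE_DETAILED_TIMINGS is not defined, which is why the #ifdef/#endif pairs around the individual timer calls in the diff below are removed.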
@@ -99,6 +99,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
use cuda_functions
@@ -144,24 +146,18 @@
character(200) :: errorMessage
integer(kind=ik) :: istat
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("bandred_complex_double")
#else
call timer%start("bandred_complex_single")
#endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
@@ -439,18 +435,14 @@
#endif
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
aux2 = aux1
@@ -485,18 +477,14 @@
vr(lr+1) = tau
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(vr, lr+1, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, lr+1, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
vmr(1:lr,lc) = vr(1:lr)
@@ -523,9 +511,7 @@
! Get global dot products
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
@@ -544,9 +530,7 @@
endif
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! if (nlc>0) aux2=aux1
@@ -790,18 +774,14 @@
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
@@ -1152,13 +1132,11 @@
endif
endif ! use GPU
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("bandred_complex_double")
#else
call timer%stop("bandred_complex_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine bandred_complex_double
......
@@ -98,6 +98,8 @@
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
#ifdef WITH_OPENMP
use omp_lib
@@ -151,20 +153,16 @@
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp, transformChunkSize
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_real" // M_PRECISION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
@@ -488,17 +486,13 @@
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(aux1, aux2, 2, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
aux2 = aux1 ! this should be optimized
#endif
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
@@ -523,13 +517,9 @@
vr(lr+1) = tau
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(vr, lr+1, M_MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
if (useGPU) then
@@ -574,13 +564,9 @@
!$omp barrier
!$omp single
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
@@ -619,13 +605,9 @@
! Get global dot products
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
@@ -862,15 +844,11 @@
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, ierr)
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
@@ -983,14 +961,10 @@
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
#endif /* WITH_MPI */
@@ -1205,9 +1179,7 @@
endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("bandred_real" // M_PRECISION_SUFFIX)
#endif
end subroutine M_bandred_real_PRECISION ! slower for gpu on 10000 10000 ???
......
@@ -73,6 +73,8 @@ module elpa2_workload
subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
@@ -82,15 +84,12 @@ module elpa2_workload
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("determine_workload")
#endif
if (na <= 0) then
limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
return
endif
@@ -105,9 +104,7 @@ module elpa2_workload
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
end subroutine
!---------------------------------------------------------------------------------------------------
! divide_band: sets the work distribution in band
@@ -116,6 +113,8 @@ module elpa2_workload
subroutine divide_band(nblocks_total, n_pes, block_limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
@@ -125,9 +124,7 @@ module elpa2_workload
integer(kind=ik) :: n, nblocks, nblocks_left
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("divide_band")
#endif
block_limits(0) = 0
if (nblocks_total < n_pes) then
@@ -149,9 +146,7 @@ module elpa2_workload
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("divide_band")
#endif
end subroutine
end module elpa2_workload
@@ -7,6 +7,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
@@ -19,9 +21,7 @@
integer(kind=ik) :: i, nc, mpierr
real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
nc = 0
do i=1,n
@@ -30,13 +30,9 @@
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(h1, h2, nc, M_MPI_REAL_PRECISION, MPI_SUM, comm, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
@@ -62,9 +58,7 @@
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
end subroutine M_symm_matrix_allreduce_PRECISION
......
@@ -42,6 +42,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use cuda_functions
use iso_c_binding
@@ -78,24 +80,18 @@
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_band_to_full_complex_double")
#else
call timer%start("trans_ev_band_to_full_complex_single")
#endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
@@ -237,18 +233,14 @@
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_DOUBLE_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
@@ -334,18 +326,14 @@
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
@@ -490,13 +478,11 @@
!print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
!endif
endif ! use GPU
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("trans_ev_band_to_full_complex_double")
#else
call timer%stop("trans_ev_band_to_full_complex_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine trans_ev_band_to_full_complex_double
......
@@ -36,6 +36,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
use cuda_functions
@@ -75,19 +77,15 @@
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_band_to_full_real" // M_PRECISION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
@@ -192,13 +190,9 @@
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(hvb(ns+1), nb-ns, M_MPI_REAL_PRECISION, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
@@ -272,13 +266,9 @@
! endif
!#endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
@@ -393,13 +383,9 @@
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(hvb(ns+1), nb-ns, M_MPI_REAL_PRECISION, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
@@ -432,14 +418,10 @@
call M_PRECISION_GEMM('T', 'N', t_rows, t_cols, l_rows, M_CONST_1_0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, M_CONST_0_0, t_tmp, cwy_blocking)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
call M_PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call M_PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
@@ -477,14 +459,10 @@
endif ! l_rows>0
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows ,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
if (l_rows>0) then
call M_PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
@@ -576,9 +554,7 @@
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_band_to_full_real" // M_PRECISION_SUFFIX)
#endif
end subroutine M_trans_ev_band_to_full_real_PRECISION
......
@@ -37,6 +37,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa2_workload
use pack_unpack_complex
@@ -156,12 +158,10 @@
integer(kind=ik) :: j1
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_tridi_to_band_complex_double")
#else
call timer%start("trans_ev_tridi_to_band_complex_single")
#endif
#endif
if (useGPU) then
@@ -180,16 +180,13 @@
max_threads = 1
max_threads = omp_get_max_threads()
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
@@ -438,13 +435,11 @@
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
@@ -455,13 +450,11 @@
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#endif /* WITH_OPENMP */
@@ -479,17 +472,13 @@
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
@@ -503,28 +492,20 @@
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_double(i - limits(ip), .false.)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
call timer%stop("mpi_communication")
#else
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS