diff --git a/Doxyfile.in b/Doxyfile.in index e52a9eb4feb9192e94b0193740663d92074dd6fe..d464ae3920efd75434b85119ed3248cb5f6d7d4a 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -733,7 +733,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT = ftimings/ +INPUT = @SRCDIR@/ftimings/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doxygen.am b/doxygen.am index cf37f1a5c538f3b4d58ad2b3ee38cca89fba49c9..f869a591a3ce75d6a872f8eae2a5b317f1453089 100644 --- a/doxygen.am +++ b/doxygen.am @@ -138,7 +138,7 @@ doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) @DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS) rm -rf @DX_DOCDIR@ - $(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG) + $(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG) DX_CLEANFILES = \ @DX_DOCDIR@/@PACKAGE@.tag \ diff --git a/ftimings/ftimings.F90 b/ftimings/ftimings.F90 index 0009263ab8b52067370d12d7902fe77854803f6f..8ba1b453e4015443ae98ca26228709cc7d495c71 100644 --- a/ftimings/ftimings.F90 +++ b/ftimings/ftimings.F90 @@ -2,9 +2,12 @@ #include "config-f90.h" #endif -!> Ftimings +!> \mainpage Ftimings !> -!> An almost pure-fortran attempt to play with graph structures :) +!> An almost pure-fortran attempt to play with tree structures, which evolved +!> into the timing library used e.g. by the VERTEX supernova code. +!> +!> All you need to know is contained in the \ref ftimings::timer_t derived type. module ftimings use ftimings_type use ftimings_value @@ -14,19 +17,21 @@ module ftimings private - ! this is mainly needed for Doxygen... + ! this is mainly needed for Doxygen, they are + ! by implicitly reachable as type-bound procedures + ! of timer_t, however Doxygen does not document them + ! if they are not also public public timer_start, timer_stop, timer_free, timer_print, & timer_enable, timer_disable, timer_is_enabled, & - timer_in_entries, timer_get, timer_since, timer_sort + timer_in_entries, timer_get, timer_since, timer_sort, & + timer_set_print_options, & + timer_measure_flops, & + timer_measure_allocated_memory, & + timer_measure_memory_bandwidth character(len=name_length), private, parameter :: own = "(own)" character(len=name_length), private, parameter :: below = "(below threshold)" -#ifdef HAVE_LIBPAPI - logical :: papi_supported = .false. - logical :: have_tried_papi_init = .false. -#endif - !> Type for a timer instance. !> !> Typical usage: @@ -54,22 +59,33 @@ module ftimings !> call timer%free() !> \endcode type, public :: timer_t - private - logical :: active = .false. !< If set to .false., most operations return immediately without any action - logical :: record_memory = .false. !< IF set to .true., record also the current resident set size - logical :: record_flop_counts = .false. !< If set to .true., record also FLOP counts via PAPI calls - type(node_t), pointer :: root => NULL() !< Start of graph - type(node_t), pointer :: current_node => NULL() !< Current position in the graph + logical, private :: active = .false. !< If set to .false., most operations return immediately without any action + logical, private :: record_allocated_memory = .false. !< IF set to .true., record also the current resident set size + logical, private :: record_flop_counts = .false. !< If set to .true., record also FLOP counts via PAPI calls + logical, private :: record_memory_bandwidth = .false. !< If set to .true., record also FLOP counts via PAPI calls + + logical, private :: print_allocated_memory = .false. + logical, private :: print_flop_count = .false. + logical, private :: print_flop_rate = .false. + logical, private :: print_ldst = .false. + logical, private :: print_memory_bandwidth = .false. + logical, private :: print_ai = .false. + integer, private :: bytes_per_ldst = 8 + + type(node_t), private, pointer :: root => NULL() !< Start of graph + type(node_t), private, pointer :: current_node => NULL() !< Current position in the graph contains procedure, pass :: start => timer_start procedure, pass :: stop => timer_stop procedure, pass :: free => timer_free procedure, pass :: print => timer_print procedure, pass :: enable => timer_enable - procedure, pass :: measure_flops => timer_measure_flops - procedure, pass :: measure_memory => timer_measure_memory procedure, pass :: disable => timer_disable procedure, pass :: is_enabled => timer_is_enabled + procedure, pass :: measure_flops => timer_measure_flops + procedure, pass :: measure_allocated_memory => timer_measure_allocated_memory + procedure, pass :: measure_memory_bandwidth => timer_measure_memory_bandwidth + procedure, pass :: set_print_options => timer_set_print_options procedure, pass :: in_entries => timer_in_entries procedure, pass :: get => timer_get procedure, pass :: since => timer_since @@ -113,7 +129,7 @@ module ftimings #ifdef HAVE_LIBPAPI interface - function papi_init() result(ret) bind(C, name="ftimings_papi_init") + function flop_init() result(ret) bind(C, name="ftimings_flop_init") use, intrinsic :: iso_c_binding implicit none integer(kind=C_INT) :: ret @@ -121,12 +137,20 @@ module ftimings end interface interface - function current_flop_count() result(cnt) bind(C, name="ftimings_current_flop_count") + function loads_stores_init() result(ret) bind(C, name="ftimings_loads_stores_init") use, intrinsic :: iso_c_binding implicit none - integer(kind=C_LONG_LONG) :: cnt + integer(kind=C_INT) :: ret end function end interface + + interface + subroutine papi_counters(flops, ldst) bind(C, name="ftimings_papi_counters") + use, intrinsic :: iso_c_binding + implicit none + integer(kind=C_LONG_LONG), intent(out) :: flops, ldst + end subroutine + end interface #endif interface @@ -141,7 +165,6 @@ module ftimings !> Activate the timer, without this, most methods are non-ops. !> - !> \implements timer_t::enable subroutine timer_enable(self) class(timer_t), intent(inout), target :: self @@ -151,65 +174,157 @@ module ftimings !> Call with enabled = .true. to also record amount of newly allocated memory. !> By default, memory usage is not recored. Call with .false. to deactivate again. !> - !> \implements timer_t::measure_memory - subroutine timer_measure_memory(self, enabled) + !> This opens /proc/self/statm, parses it, and closes it agagain and is thus + !> quite costly, use when appropriate. + !> + subroutine timer_measure_allocated_memory(self, enabled) + class(timer_t), intent(inout) :: self + logical, intent(in) :: enabled + + self%record_allocated_memory = enabled + end subroutine + + !> Call with enabled = .true. to also record the memory bandwidth with PAPI + !> By default, this is not recorded. Call with .false. to deactivate again. + !> + subroutine timer_measure_memory_bandwidth(self, enabled) class(timer_t), intent(inout) :: self logical, intent(in) :: enabled - self%record_memory = enabled + if (enabled) then +#ifdef HAVE_LIBPAPI + if (loads_stores_init() == 1) then + self%record_memory_bandwidth = .true. + else + write(0,'(a)') "ftimings: Could not initialize PAPI, disabling memory bandwidth counter" + self%record_memory_bandwidth = .false. + endif +#else + write(0,'(a)') "ftimings: not compiled with PAPI support, disabling memory bandwidth counter" + self%record_memory_bandwidth = .false. +#endif + else + ! explicitly set to .false. by caller + self%record_memory_bandwidth = .false. + endif end subroutine !> Call with enabled = .true. to also record FLOP counts via PAPI calls. !> By default no FLOPS are recored. Call with .false. to deactivate again. !> - !> \implements timer_t::measure_flops subroutine timer_measure_flops(self, enabled) class(timer_t), intent(inout) :: self logical, intent(in) :: enabled if (enabled) then #ifdef HAVE_LIBPAPI -!$OMP CRITICAL - if (.not. have_tried_papi_init) then - have_tried_papi_init = .true. - if (papi_init() == 1) then - papi_supported = .true. - else - papi_supported = .false. - endif - endif -!$OMP END CRITICAL - if (.not. papi_supported) then - write(0,'(a)') "ftimings: Could not activate PAPI, disabling FLOP counter" - self%record_flop_counts = .false. - else + if (flop_init() == 1) then self%record_flop_counts = .true. + else + write(0,'(a)') "ftimings: Could not initialize PAPI, disabling FLOP counter" + self%record_flop_counts = .false. endif #else write(0,'(a)') "ftimings: not compiled with PAPI support, disabling FLOP counter" self%record_flop_counts = .false. #endif else - ! explicitly set to .false. by caller + ! Explicitly set to .false. by caller self%record_flop_counts = .false. endif end subroutine - !> Deactive the timer + !> Deactivate the timer !> - !> \implements timer_t::disable subroutine timer_disable(self) class(timer_t), intent(inout), target :: self self%active = .false. end subroutine - !> \implements timer_t::is_enabled + !> Return whether the timer is currently running + !> function timer_is_enabled(self) result(is) class(timer_t), intent(inout), target :: self logical :: is is = self%active end function + !> Control what to print on following %print calls + !> + !> \param print_allocated_memory Amount of newly allocated, resident memory + !> \param print_flop_count Number of floating point operations + !> \param print_flop_rate Rate of floating point operations per second + !> \param print_ldst Number of loads+stores + !> \param print_memory_bandwidth Rate of loads+stores per second + !> \param print_ai Arithmetic intensity, that is number of + !> floating point operations per number of + !> load and store operations (currently untested) + !> \param bytes_per_ldst For calculating the AI, assume this number + !> of bytes per load or store (default: 8) + subroutine timer_set_print_options(self, & + print_allocated_memory, & + print_flop_count, & + print_flop_rate, & + print_ldst, & + print_memory_bandwidth, & + print_ai, & + bytes_per_ldst) + class(timer_t), intent(inout) :: self + logical, intent(in), optional :: & + print_allocated_memory, & + print_flop_count, & + print_flop_rate, & + print_ldst, & + print_memory_bandwidth, & + print_ai + integer, intent(in), optional :: bytes_per_ldst + + if (present(print_allocated_memory)) then + self%print_allocated_memory = print_allocated_memory + if ((.not. self%record_allocated_memory) .and. self%print_allocated_memory) then + write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!" + endif + endif + + if (present(print_flop_count)) then + self%print_flop_count = print_flop_count + if ((.not. self%record_flop_counts) .and. self%print_flop_count) then + write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" + endif + endif + + if (present(print_flop_rate)) then + self%print_flop_rate = print_flop_rate + if ((.not. self%record_flop_counts) .and. self%print_flop_rate) then + write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" + endif + endif + + if (present(print_ldst)) then + self%print_ldst = print_ldst + if ((.not. self%record_memory_bandwidth) .and. self%print_ldst) then + write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros!" + endif + endif + if (present(print_memory_bandwidth)) then + self%print_memory_bandwidth = print_memory_bandwidth + if ((.not. self%record_memory_bandwidth) .and. self%print_memory_bandwidth) then + write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros for memory bandwidth!" + endif + endif + + if (present(print_ai)) then + self%print_ai = print_ai + if (.not. (self%record_memory_bandwidth .and. self%record_flop_counts)) then + write(0,'(a)') "ftimings: Warning: Memory bandwidth or FLOP counters were disabled, expect invalid values for AI" + endif + endif + + if (present(bytes_per_ldst)) then + self%bytes_per_ldst = bytes_per_ldst + endif + end subroutine + !> Start a timing section !> !> \param name A descriptive name @@ -236,7 +351,6 @@ module ftimings !> call timer%stop("B") !> \endcode !> - !> \implements timer_t::start subroutine timer_start(self, name, replace) class(timer_t), intent(inout), target :: self character(len=*), intent(in) :: name @@ -281,7 +395,7 @@ module ftimings !$ do i = 0, omp_get_level() !$ write(*,*) "omp_get_ancestor_thread_num(", i, ") = ", omp_get_ancestor_thread_num(i) !$ end do - call self%root%print_graph(0, .false., .false., .false.) + call self%root%print_graph(0) !$omp end critical stop "timer_start() while same timer was active" endif @@ -315,7 +429,6 @@ module ftimings !> If given, warns if it does not match the last %start() !> call on stderr and disables the current timer instance. !> - !> \implements timer_t::stop subroutine timer_stop(self, name) class(timer_t), intent(inout), target :: self character(len=*), intent(in), optional :: name @@ -370,7 +483,6 @@ module ftimings !> Deallocate all objects associated with (but not including) self !> - !> \implements timer_t::free subroutine timer_free(self) class(timer_t), intent(inout), target :: self if (associated(self%root)) then @@ -389,21 +501,16 @@ module ftimings !> \param threshold If given, subsume any entries with a value of threshold !> seconds in a single node "(below threshold)" !> \param is_sorted Assume a sorted graph for inserting "(own)" and "(below threshold)" + !> \param unit The unit number on which to print, default stdout !> - !> \implements timer_t::print - subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit, & - print_memory, print_flop_count, print_flop_rate) + subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit) class(timer_t), intent(in), target :: self character(len=*), intent(in), optional :: name1, name2, name3, name4 real(kind=rk), intent(in), optional :: threshold logical, intent(in), optional :: is_sorted integer, intent(in), optional :: unit - logical, intent(in), optional :: print_memory - logical, intent(in), optional :: print_flop_count, print_flop_rate integer :: unit_act - logical :: do_print_memory - logical :: do_print_flop_count, do_print_flop_rate type(node_t), pointer :: node character(len=64) :: format_spec @@ -412,9 +519,13 @@ module ftimings character(len=name_length), parameter :: group = "Group" character(len=12), parameter :: seconds = " [s]" character(len=12), parameter :: fract = " fraction" + character(len=12), parameter :: ram = " alloc. RAM" character(len=12), parameter :: flop_rate = " Mflop/s" character(len=12), parameter :: flop_count = " Mflop" - character(len=12), parameter :: ram = " alloc. RAM" + character(len=12), parameter :: ldst = "loads+stores" + character(len=12), parameter :: bandwidth = " mem bandw." + character(len=12), parameter :: ai = "arithm. Int." + character(len=12), parameter :: dash = "============" if (.not. self%active) then return @@ -456,53 +567,58 @@ module ftimings endif end if - if (present(print_memory)) then - do_print_memory = print_memory - else - do_print_memory = self%record_memory - endif - if ((.not. self%record_memory) .and. do_print_memory) then - write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!" + ! I really do hate it .. + write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length + write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract + + if (self%print_allocated_memory) then + write(unit_act,'(1x,a12)',advance='no') ram endif - if (present(print_flop_count)) then - do_print_flop_count = print_flop_count - else - do_print_flop_count = self%record_flop_counts + if (self%print_flop_count) then + write(unit_act,'(1x,a12)',advance='no') flop_count endif - if ((.not. self%record_flop_counts) .and. do_print_flop_count) then - write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" + if (self%print_flop_rate) then + write(unit_act,'(1x,a12)',advance='no') flop_rate endif - - if (present(print_flop_rate)) then - do_print_flop_rate = print_flop_rate - else - do_print_flop_rate = self%record_flop_counts + if (self%print_ldst) then + write(unit_act,'(1x,a12)',advance='no') ldst endif - if ((.not. self%record_flop_counts) .and. do_print_flop_rate) then - write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" + if (self%print_memory_bandwidth) then + write(unit_act,'(1x,a12)',advance='no') bandwidth + endif + if (self%print_ai) then + write(unit_act,'(1x,a12)',advance='no') ai endif - ! I really do hate it .. - write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length - write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract + write(unit_act,'(a)') "" - if (do_print_memory) then - write(unit_act,'(1x,a12)',advance='no') ram + write(format_spec,'("("" | "",a",i0,",1x,2(1x,a12))")') name_length + write(unit_act, format_spec, advance='no') "", dash, dash + + if (self%print_allocated_memory) then + write(unit_act,'(1x,a12)',advance='no') dash endif - if (do_print_flop_count) then - write(unit_act,'(1x,a12)',advance='no') flop_count + if (self%print_flop_count) then + write(unit_act,'(1x,a12)',advance='no') dash endif - if (do_print_flop_rate) then - write(unit_act,'(1x,a12)',advance='no') flop_rate + if (self%print_flop_rate) then + write(unit_act,'(1x,a12)',advance='no') dash + endif + if (self%print_ldst) then + write(unit_act,'(1x,a12)',advance='no') dash + endif + if (self%print_memory_bandwidth) then + write(unit_act,'(1x,a12)',advance='no') dash + endif + if (self%print_ai) then + write(unit_act,'(1x,a12)',advance='no') dash endif write(unit_act,'(a)') "" - call node%print_graph(0, & - do_print_memory, do_print_flop_count, do_print_flop_rate, & - threshold, is_sorted, unit=unit) + call node%print_graph(0, threshold, is_sorted, unit=unit) end subroutine @@ -517,7 +633,6 @@ module ftimings !> For example timer%in_entries("foo", "bar", "parallel") returns !> the sum of all entries named "parallel" below the foo->bar node !> - !> \implements timer_t::in_entries function timer_in_entries(self, name1, name2, name3, name4) result(s) use, intrinsic :: iso_fortran_env, only : error_unit class(timer_t), intent(in), target :: self @@ -572,7 +687,6 @@ module ftimings !> The result is only meaningfull if the entry was never appended by !> additional %start() calls. !> - !> \implements timer_t::get function timer_get(self, name1, name2, name3, name4, name5, name6) result(s) class(timer_t), intent(in), target :: self ! this is clunky, but what can you do.. @@ -643,7 +757,6 @@ module ftimings !> The result is only meaningfull if the entry was never appended by !> additional %start() calls. !> - !> \implements timer_t::since function timer_since(self, name1, name2, name3, name4) result(s) class(timer_t), intent(in), target :: self character(len=*), intent(in), optional :: name1, name2, name3, name4 @@ -693,7 +806,6 @@ module ftimings !> Sort the graph on each level. !> Warning: This irrevocable destroys the old ordering. !> - !> \implements timer_t::sort subroutine timer_sort(self) class(timer_t), intent(inout), target :: self type(node_t), pointer :: node @@ -721,14 +833,13 @@ module ftimings ! current time val%micros = microseconds_since_epoch() - if (self%timer%record_memory) then + if (self%timer%record_allocated_memory) then val%rsssize = resident_set_size() endif #ifdef HAVE_LIBPAPI - ! flop counter - if (self%timer%record_flop_counts) then - val%flop_count = current_flop_count() + if (self%timer%record_flop_counts .or. self%timer%record_memory_bandwidth) then + call papi_counters(val%flop_count, val%ldst) endif #endif end function @@ -927,12 +1038,9 @@ module ftimings end do end subroutine - subroutine node_print(self, indent_level, & - print_memory, print_flop_count, print_flop_rate, & - total, unit) + subroutine node_print(self, indent_level, total, unit) class(node_t), intent(inout) :: self integer, intent(in) :: indent_level - logical, intent(in) :: print_memory, print_flop_count, print_flop_rate type(value_t), intent(in) :: total type(value_t) :: val integer, intent(in) :: unit @@ -954,18 +1062,13 @@ module ftimings else val = self%value endif - call val%print(indent_level, & - print_memory, print_flop_count, print_flop_rate, & - name, total, unit) + call print_value(val, self%timer, indent_level, name, total, unit) end subroutine - recursive subroutine node_print_graph(self, indent_level, & - print_memory, print_flop_count, print_flop_rate, & - threshold, is_sorted, total, unit) + recursive subroutine node_print_graph(self, indent_level, threshold, is_sorted, total, unit) use, intrinsic :: iso_fortran_env, only : output_unit class(node_t), intent(inout) :: self integer, intent(in) :: indent_level - logical, intent(in) :: print_memory, print_flop_count, print_flop_rate real(kind=rk), intent(in), optional :: threshold logical, intent(in), optional :: is_sorted type(value_t), intent(in), optional :: total @@ -1006,9 +1109,7 @@ module ftimings unit_act = output_unit endif - call self%print(indent_level, & - print_memory, print_flop_count, print_flop_rate, & - total_act, unit_act) + call self%print(indent_level, total_act, unit_act) own_value = cur_value - self%sum_of_children() below_threshold_value = self%sum_of_children_below(threshold) @@ -1024,6 +1125,7 @@ module ftimings allocate(own_node) own_node%value = own_value own_node%name = own + own_node%timer => self%timer call insert_into_sorted_list(self%firstChild, own_node) endif @@ -1032,20 +1134,17 @@ module ftimings allocate(threshold_node) threshold_node%value = below_threshold_value threshold_node%name = below + threshold_node%timer => self%timer call insert_into_sorted_list(self%firstChild, threshold_node) endif else ! print them first if (print_own) then - call own_value%print(indent_level + 1, & - print_memory, print_flop_count, print_flop_rate, & - own, cur_value, unit_act) + call print_value(own_value, self%timer, indent_level + 1, own, cur_value, unit_act) endif if (print_threshold) then - call below_threshold_value%print(indent_level + 1, & - print_memory, print_flop_count, print_flop_rate, & - below, cur_value, unit_act) + call print_value(below_threshold_value, self%timer, indent_level + 1, below, cur_value, unit_act) endif endif @@ -1056,9 +1155,7 @@ module ftimings if (node_value%micros * 1e-6_rk >= threshold_act & .or. associated(node, threshold_node) & .or. associated(node, own_node)) then - call node%print_graph(indent_level + 1, & - print_memory, print_flop_count, print_flop_rate, & - threshold, is_sorted, cur_value, unit_act) + call node%print_graph(indent_level + 1, threshold, is_sorted, cur_value, unit_act) endif node => node%nextSibling end do @@ -1176,4 +1273,69 @@ module ftimings enddo end subroutine + subroutine print_value(value, timer, indent_level, label, total, unit) + type(value_t), intent(in) :: value + type(timer_t), intent(in) :: timer + integer, intent(in) :: indent_level + character(len=name_length), intent(in) :: label + type(value_t), intent(in) :: total + integer, intent(in) :: unit + + character(len=64) :: format_spec + + write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length + write(unit,format_spec,advance='no') & + label, & + real(value%micros, kind=rk) * 1e-6_rk, & + real(value%micros, kind=rk) / real(total%micros, kind=rk) + + if (timer%print_allocated_memory) then + write(unit,'(1x,a12)',advance='no') & + nice_format(real(value%rsssize, kind=C_DOUBLE)) + endif + if (timer%print_flop_count) then + write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / 1e6_rk + endif + if (timer%print_flop_rate) then + write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / value%micros + endif + if (timer%print_ldst) then + write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst, kind=rk)) + endif + if (timer%print_memory_bandwidth) then + write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst*timer%bytes_per_ldst, kind=rk) / (value%micros * 1e-6_rk)) + endif + if (timer%print_ai) then + write(unit,'(1x,f12.4)',advance='no') real(value%flop_count, kind=rk) / value%ldst / timer%bytes_per_ldst + endif + + write(unit,'(a)') "" + end subroutine + + pure elemental function nice_format(number) result(string) + real(kind=C_DOUBLE), intent(in) :: number + character(len=12) :: string + real(kind=C_DOUBLE), parameter :: & + kibi = 2.0_C_DOUBLE**10, & + mebi = 2.0_C_DOUBLE**20, & + gibi = 2.0_C_DOUBLE**30, & + tebi = 2.0_C_DOUBLE**40, & + pebi = 2.0_C_DOUBLE**50 + + if (abs(number) >= pebi) then + write(string,'(es12.2)') number + else if (abs(number) >= tebi) then + write(string,'(f9.2,'' Ti'')') number / tebi + else if (abs(number) >= gibi) then + write(string,'(f9.2,'' Gi'')') number / gibi + else if (abs(number) >= mebi) then + write(string,'(f9.2,'' Mi'')') number / mebi + else if (abs(number) >= kibi) then + write(string,'(f9.2,'' ki'')') number / kibi + else + write(string,'(f12.2)') number + endif + end function + + end module diff --git a/ftimings/ftimings_value.F90 b/ftimings/ftimings_value.F90 index 4586be465d962e1c0befe6ba035f3d3e744d3575..d329d7caf7351a719ce2192a9f074bf8214c84a8 100644 --- a/ftimings/ftimings_value.F90 +++ b/ftimings/ftimings_value.F90 @@ -8,11 +8,10 @@ module ftimings_value public type value_t - integer(kind=C_INT64_T) :: micros = 0 ! Cumulative microseconds spent in this node - integer(kind=C_LONG) :: rsssize = 0 - integer(kind=C_LONG_LONG) :: flop_count = 0 ! Cumulative floating point operations done in this node - contains - procedure, pass :: print => print_value + integer(kind=C_INT64_T) :: micros = 0 ! microseconds spent in this node + integer(kind=C_LONG) :: rsssize = 0 ! newly used resident memory + integer(kind=C_LONG_LONG) :: flop_count = 0 ! floating point operations done in this node + integer(kind=C_LONG_LONG) :: ldst = 0 ! number of loads and stores end type interface operator(+) @@ -37,6 +36,7 @@ module ftimings_value c%rsssize = a%rsssize + b%rsssize #ifdef HAVE_LIBPAPI c%flop_count = a%flop_count + b%flop_count + c%ldst = a%ldst + b%ldst #endif end function @@ -47,6 +47,7 @@ module ftimings_value c%rsssize = a%rsssize - b%rsssize #ifdef HAVE_LIBPAPI c%flop_count = a%flop_count - b%flop_count + c%ldst = a%ldst - b%ldst #endif end function @@ -57,65 +58,7 @@ module ftimings_value neg_a%rsssize = - a%rsssize #ifdef HAVE_LIBPAPI neg_a%flop_count = - a%flop_count + neg_a%ldst = - a%ldst #endif end function - - subroutine print_value(self, indent_level, & - print_memory, print_flop_count, print_flop_rate, & - label, total, unit) - class(value_t), intent(in) :: self - integer, intent(in) :: indent_level - logical, intent(in) :: print_memory - logical, intent(in) :: print_flop_count, print_flop_rate - character(len=name_length), intent(in) :: label - type(value_t), intent(in) :: total - integer, intent(in) :: unit - - character(len=64) :: format_spec - - write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length - write(unit,format_spec,advance='no') & - label, & - real(self%micros, kind=rk) * 1e-6_rk, & - real(self%micros, kind=rk) / real(total%micros, kind=rk) - - if (print_memory) then - write(unit,'(1x,a12)',advance='no') & - nice_format(real(self%rsssize, kind=C_DOUBLE)) - endif - if (print_flop_count) then - write(unit,'(1x,f12.2)',advance='no') real(self%flop_count, kind=rk) / 1e6_rk - endif - if (print_flop_rate) then - write(unit,'(1x,f12.2)',advance='no') real(self%flop_count, kind=rk) / self%micros - endif - - write(unit,'(a)') "" - end subroutine - - pure elemental function nice_format(flops) result(string) - real(kind=C_DOUBLE), intent(in) :: flops - character(len=12) :: string - real(kind=C_DOUBLE), parameter :: & - kibi = 2.0_C_DOUBLE**10, & - mebi = 2.0_C_DOUBLE**20, & - gibi = 2.0_C_DOUBLE**30, & - tebi = 2.0_C_DOUBLE**40, & - pebi = 2.0_C_DOUBLE**50 - - if (abs(flops) >= pebi) then - write(string,'(es12.1)') flops - else if (abs(flops) >= tebi) then - write(string,'(f9.1,'' Ti'')') flops / tebi - else if (abs(flops) >= gibi) then - write(string,'(f9.1,'' Gi'')') flops / gibi - else if (abs(flops) >= mebi) then - write(string,'(f9.1,'' Mi'')') flops / mebi - else if (abs(flops) >= kibi) then - write(string,'(f9.1,'' ki'')') flops / kibi - else - write(string,'(f9.1,'' '')') flops - endif - end function - end module diff --git a/ftimings/papi.c b/ftimings/papi.c index db3d86b4b660f3ce605323a90aff295de333ffdb..9e12c6c89bf05de5f2adcc9ecd649ddcc350ca84 100644 --- a/ftimings/papi.c +++ b/ftimings/papi.c @@ -5,51 +5,142 @@ #include "config.h" #endif -static int flops_event_set; +static int event_set; + +static int tried_papi_init = 0; +static int papi_available = 0; +static int flops_available = 0; +static int ldst_available = 0; #ifdef HAVE_LIBPAPI #include <papi.h> int ftimings_papi_init(void) { int ret; - flops_event_set = PAPI_NULL; - if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n", - __FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret)); - return 0; - } - if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - return 0; + if (tried_papi_init) { + return papi_available; } - if ((ret = PAPI_create_eventset(&flops_event_set)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - return 0; - } - if ((ret = PAPI_add_event(flops_event_set, PAPI_DP_OPS)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - return 0; - } - if ((ret = PAPI_start(flops_event_set)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - return 0; +#pragma omp critical + { + /* Think about it :) */ + if (tried_papi_init) { + goto end; + } + + tried_papi_init = 1; + + event_set = PAPI_NULL; + + if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) { + fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n", + __FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret)); + goto error; + } + + if ((ret = PAPI_create_eventset(&event_set)) < 0) { + fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + goto error; + } + + /* Check FLOP counter availability */ + if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) { + fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + flops_available = 0; + } else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) { + fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + flops_available = 0; + } else { + flops_available = 1; + } + + /* Loads + Stores */ + if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) { + fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + ldst_available = 0; + } else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) { + fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + ldst_available = 0; + } else if ((ret = PAPI_add_event(event_set, PAPI_LD_INS)) < 0) { + fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + ldst_available = 0; + } else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) { + fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + ldst_available = 0; + } else { + ldst_available = 1; + } + + /* Start */ + if ((ret = PAPI_start(event_set)) < 0) { + fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n", + __FILE__, __LINE__, PAPI_strerror(ret)); + goto error; + } + + goto end; + +error: + /* PAPI works */ + papi_available = 0; + +end: + /* PAPI works */ + papi_available = 1; + + } /* End of critical region */ + + return papi_available; +} + +int ftimings_flop_init(void) { + int ret; + + if (!tried_papi_init) { + ftimings_papi_init(); } - return 1; + + return flops_available; } -long long ftimings_current_flop_count(void) { - long long count; +int ftimings_loads_stores_init(void) { int ret; - if ((ret = PAPI_read(flops_event_set, &count)) < 0) { + + if (!tried_papi_init) { + ftimings_papi_init(); + } + + return ldst_available; +} + +void ftimings_papi_counters(long long *flops, long long *ldst) { + long long res[3]; + int i, ret; + + if ((ret = PAPI_read(event_set, &res[0])) < 0) { fprintf(stderr, "PAPI_read: %s\n", PAPI_strerror(ret)); exit(1); } - return count; + + i = 0; + if (flops_available) { + *flops = res[i++]; + } else { + *flops = 0LL; + } + if (ldst_available) { + *ldst = res[i++]; + *ldst += res[i++]; + } else { + *ldst = 0LL; + } } #endif diff --git a/ftimings/resident_set_size.c b/ftimings/resident_set_size.c index 5ce519f8719c780316d4d2cdaf18ca44aea939fc..fc395b0e0fd94b751c08f78cc1bd4ce105698db5 100644 --- a/ftimings/resident_set_size.c +++ b/ftimings/resident_set_size.c @@ -1,18 +1,7 @@ #include <stdio.h> #include <unistd.h> -/*#include <sys/time.h> -#include <sys/resource.h>*/ - long ftimings_resident_set_size() { -/* struct rusage usage; - if (getrusage(RUSAGE_SELF, &usage) != 0) { - perror("getrusage"); - exit(1); - } - return usage.ru_maxrss; -*/ - long rss = 0L; FILE* fp = NULL; if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) { diff --git a/test/do_flops.c b/test/do_flops.c index 31701ef665dc511158402517eb137cb003cb8029..ab6f60cc24cf05904a4e6dc64247f313455ee0b9 100644 --- a/test/do_flops.c +++ b/test/do_flops.c @@ -1,11 +1,14 @@ -volatile double a = 0.25; +#define N 1048576L -double megaflop(void) { +double a[N]; +double b[N]; +double c[N]; +double d[N]; + +void vector_triad(void) { int i; - double c = 0.5; - for (i=0; i < 500000; i++) { - c = c * a + a; + for (i=0; i < N; i++) { + a[i] = b[i] + c[i] * d[i]; } - return c; } diff --git a/test/test_timings.F90 b/test/test_timings.F90 index 7d216defe91361ebadfff1783fdb262d558c2449..d253b7f82e34d452d90b4bb2a42d4db67462ca86 100644 --- a/test/test_timings.F90 +++ b/test/test_timings.F90 @@ -6,7 +6,14 @@ program test_timings integer :: i, j call timer%measure_flops(.true.) - call timer%measure_memory(.true.) + call timer%measure_allocated_memory(.true.) + call timer%measure_memory_bandwidth(.true.) + + call timer%set_print_options(& + print_flop_count=.true., & + print_flop_rate=.true., & + print_memory_bandwidth=.true., & + print_ai=.true., bytes_per_ldst=16) call timer%enable() @@ -40,16 +47,16 @@ program test_timings call timer%print("program") else ! usual printing of current subtree - call timer%print("program", "cycle", print_flop_count=.true.) + call timer%print("program", "cycle") endif write(*,*) - write(*,'(a,f9.6)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle") - write(*,'(a,f9.6)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle") - write(*,'(a,f9.6)') " cycle total : ", timer%get("program", "cycle") + write(*,'(a,f12.6)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle") + write(*,'(a,f12.6)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle") + write(*,'(a,f12.6)') " cycle total : ", timer%get("program", "cycle") #ifndef _OPENMP - write(*,'(a,f9.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c") + write(*,'(a,f12.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c") #else - write(*,'(a,f9.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c") + write(*,'(a,f12.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c") #endif write(*,*) endif @@ -84,6 +91,7 @@ program test_timings write(*,*) write(*,*) "Whole tree:" + call timer%set_print_options(print_ldst=.true.) call timer%print(is_sorted=.true.) call timer%free() @@ -111,12 +119,14 @@ program test_timings subroutine c() interface - subroutine megaflop() bind(C, name="megaflop") + subroutine vector_triad() bind(C, name="vector_triad") end subroutine end interface integer :: i call timer%start("c") - call megaflop() - call timer%stop("c") + call timer%start("2.097 Mflop, AI=0.0625") + call vector_triad() + call timer%stop() + call timer%stop() end subroutine end program