Commit c40d388d authored by Lorenz Huedepohl's avatar Lorenz Huedepohl
Browse files

PAPI support for FLOP counts

Next so some refactoring into four separate source files, support for
also recording values of perfomance counters via the PAPI library was
added, at the moment a FLOP count is measured and results are presented
in timer_print as MFlop/s.
parent 50aa8de1
......@@ -4,12 +4,14 @@
.deps
.dirstamp
.libs
Doxyfile
Makefile
Makefile.in
aclocal.m4
ar-lib
autom4te.cache/
compile
config-f90.h
config.guess
config.h
config.h.in
......@@ -19,14 +21,13 @@ config.status
config.sub
configure
depcomp
docs/
ftimings-*-*.pc
ftimings.mod
ftimings_*_test
install-sh
libtool
ltmain.sh
m4/
missing
stamp-h1
ftimings-*-*.pc
ftimings_*_test
ftimings.mod
docs/
Doxyfile
......@@ -10,6 +10,10 @@ lib_LTLIBRARIES = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
# libftimings
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES = \
ftimings/time.c \
ftimings/papi.c \
ftimings/ftimings_type.F90 \
ftimings/ftimings_value.F90 \
ftimings/ftimings_node.F90 \
ftimings/ftimings.F90
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_LDFLAGS = -version-info $(FTIMINGS_SO_VERSION)
......@@ -26,9 +30,13 @@ bin_PROGRAMS = ftimings_@FC@_test
# test
ftimings_@FC@_test_SOURCES = test/test_timings.F90
ftimings_@FC@_test_LDADD = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
ftimings_@FC@_test_LDFLAGS = -static
# other files to distribute
filesdir = $(datadir)/@PACKAGE@-@FC@/examples
files_DATA = test/test_timings.F90
include doxygen.am
clean-local:
rm *.mod
......@@ -11,6 +11,23 @@ AC_LANG([C])
AM_PROG_CC_C_O
AC_OPENMP
AC_ARG_ENABLE([papi],
[AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count, autodetected by default])],
[want_papi=$enableval],[want_papi="auto"])
papi_found=unknown
if test x"$want_papi" != x"no" ; then
AC_CHECK_LIB([papi],[PAPI_library_init],[papi_found="yes"],[papi_found="no"])
if test x"$want_papi" = x"yes" ; then
if test x"$papi_found" = x"no" ; then
AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"])
fi
fi
fi
if test x"$papi_found" = x"yes"; then
AC_DEFINE([HAVE_LIBPAPI], [1], [Use the PAPI library])
LIBS="-lpapi $LIBS"
fi
AC_LANG([Fortran])
AC_PROG_FC
AC_FC_LIBRARY_LDFLAGS
......@@ -19,6 +36,8 @@ AC_OPENMP
LT_INIT
AC_ENABLE_SHARED
AC_ENABLE_STATIC
PKG_PROG_PKG_CONFIG
......@@ -40,3 +59,5 @@ AC_CONFIG_FILES([Makefile
ftimings-${FTIMINGS_API_VERSION}-${FC}.pc:ftimings.pc.in
])
AC_OUTPUT
grep "^#define" config.h > config-f90.h
This diff is collapsed.
module ftimings_node
use ftimings_type
use ftimings_value
implicit none
! Private type node_t, representing a graph node
!
type node_t
character(len=name_length) :: name ! Descriptive name, used when printing the timings
integer :: count = 0 ! Number of node_stop calls
type(value_t) :: value ! The actual counter data, see ftimings_values.F90
logical :: is_running = .false. ! .true. if still running
type(node_t), pointer :: firstChild => NULL()
type(node_t), pointer :: lastChild => NULL()
type(node_t), pointer :: parent => NULL()
type(node_t), pointer :: nextSibling => NULL()
contains
procedure, pass :: start => node_start
procedure, pass :: stop => node_stop
procedure, pass :: new_child => node_new_child
procedure, pass :: get_child => node_get_child
procedure, pass :: get_value => node_get_value
procedure, pass :: sum_of_children => node_sum_of_children
procedure, pass :: sum_of_children_with_name => node_sum_of_children_with_name
procedure, pass :: sum_of_children_below => node_sum_of_children_below
procedure, pass :: print => node_print
procedure, pass :: print_graph => node_print_graph
procedure, pass :: sort_children => node_sort_children
end type
character(len=name_length), private, parameter :: own = "(own)"
character(len=name_length), private, parameter :: below = "(below threshold)"
contains
subroutine node_start(self)
class(node_t), intent(inout) :: self
! take the time
self%value = self%value - now()
self%is_running = .true.
end subroutine
subroutine node_stop(self)
class(node_t), intent(inout) :: self
self%count = self%count + 1
! take the time
self%value = self%value + now()
self%is_running = .false.
end subroutine
function node_new_child(self, name) result(new)
class(node_t), intent(inout), target :: self
character(len=*), intent(in) :: name
type(node_t), pointer :: new
if (.not. associated(self%lastChild)) then
allocate(self%lastChild)
new => self%lastChild
self%firstChild => new
else
allocate(self%lastChild%nextSibling)
new => self%lastChild%nextSibling
self%lastChild => new
endif
new%name = name
new%count = 0
select type (self)
type is (node_t)
new%parent => self
class default
stop "node_new_child(): This should not happen"
end select
nullify(new%firstChild)
nullify(new%lastChild)
nullify(new%nextSibling)
end function
function string_eq(str1, str2) result(eq)
character(len=name_length), intent(in) :: str1
character(len=*), intent(in) :: str2
logical :: eq
eq = trim(str1) .eq. str2(1:min(len(trim(str2)), name_length))
end function
function node_get_child(self, name) result(child)
class(node_t), intent(in) :: self
character(len=*), intent(in) :: name
type(node_t), pointer :: child
child => self%firstChild
do while (associated(child))
if (string_eq(child%name, name)) then
return
endif
child => child%nextSibling
enddo
nullify(child)
end function
function node_get_value(self) result(val)
class(node_t), intent(in) :: self
type(value_t) :: val
val = self%value
if (self%is_running) then
! we have not finished, give time up to NOW
val = val + now()
endif
end function
recursive subroutine deallocate_node(entry)
type(node_t), intent(inout), pointer :: entry
type(node_t), pointer :: nextSibling
if (associated(entry%firstChild)) then
call deallocate_node(entry%firstChild)
endif
nextSibling => entry%nextSibling
deallocate(entry)
nullify(entry)
if (associated(nextSibling)) then
call deallocate_node(nextSibling)
endif
end subroutine
function node_sum_of_children(self) result(sum_time)
class(node_t), intent(in) :: self
type(node_t), pointer :: cur_entry
type(value_t) :: sum_time
cur_entry => self%firstChild
do while (associated(cur_entry))
sum_time = sum_time + cur_entry%get_value()
cur_entry => cur_entry%nextSibling
enddo
end function
recursive function node_sum_of_children_with_name(self, name) result(sum_time)
class(node_t), intent(in) :: self
character(len=*), intent(in) :: name
type(node_t), pointer :: cur_entry
type(value_t) :: sum_time
cur_entry => self%firstChild
do while (associated(cur_entry))
if (string_eq(cur_entry%name, name)) then
sum_time = sum_time + cur_entry%value
else
sum_time = sum_time + cur_entry%sum_of_children_with_name(name)
endif
cur_entry => cur_entry%nextSibling
enddo
end function
function node_sum_of_children_below(self, threshold) result(sum_time)
class(node_t), intent(in) :: self
real(kind=rk), intent(in), optional :: threshold
type(node_t), pointer :: cur_entry
type(value_t) :: sum_time, cur_value
if (.not. present(threshold)) then
return
endif
cur_entry => self%firstChild
do while (associated(cur_entry))
cur_value = cur_entry%get_value()
if (cur_value%micros * 1e-6_rk < threshold) then
sum_time = sum_time + cur_value
endif
cur_entry => cur_entry%nextSibling
enddo
end function
subroutine insert_into_sorted_list(head, node)
type(node_t), pointer, intent(inout) :: head
type(node_t), target, intent(inout) :: node
type(node_t), pointer :: cur
if (node%value%micros >= head%value%micros) then
node%nextSibling => head
head => node
return
endif
cur => head
do while (associated(cur%nextSibling))
if (cur%value%micros > node%value%micros .and. node%value%micros >= cur%nextSibling%value%micros) then
node%nextSibling => cur%nextSibling
cur%nextSibling => node
return
endif
cur => cur%nextSibling
end do
! node has to be appended at the end
cur%nextSibling => node
node%nextSibling => NULL()
end subroutine
subroutine remove_from_list(head, node)
type(node_t), pointer, intent(inout) :: head
type(node_t), pointer, intent(in) :: node
type(node_t), pointer :: cur
if (associated(head,node)) then
head => head%nextSibling
return
endif
cur => head
do while (associated(cur%nextSibling))
if (associated(cur%nextSibling,node)) then
cur%nextSibling => cur%nextSibling%nextSibling
return
endif
cur => cur%nextSibling
end do
end subroutine
subroutine node_print(self, indent_level, total, unit)
class(node_t), intent(inout) :: self
integer, intent(in) :: indent_level
type(value_t), intent(in) :: total
type(value_t) :: val
integer, intent(in) :: unit
character(len=name_length) :: name, suffix
if (self%is_running) then
name = trim(self%name) // " (running)"
else
name = self%name
endif
if (self%count > 1) then
write(suffix, '(" (",i0,"x)")') self%count
name = trim(name) // " " // trim(suffix)
endif
if (self%is_running) then
val = self%value + now()
else
val = self%value
endif
call val%print(indent_level, name, total, unit)
end subroutine
recursive subroutine node_print_graph(self, indent_level, threshold, is_sorted, total, unit)
use, intrinsic :: iso_fortran_env, only : output_unit
class(node_t), intent(inout) :: self
integer, intent(in) :: indent_level
real(kind=rk), intent(in), optional :: threshold
logical, intent(in), optional :: is_sorted
type(value_t), intent(in), optional :: total
integer, intent(in), optional :: unit
type(node_t), pointer :: node
integer :: i
type(value_t) :: cur_value, node_value, own_value, below_threshold_value, total_act
type(node_t), target :: own_node, threshold_node
real(kind=rk) :: threshold_act
logical :: is_sorted_act, print_own, print_threshold
integer :: unit_act
if (present(threshold)) then
threshold_act = threshold
else
threshold_act = 0
endif
if (present(is_sorted)) then
is_sorted_act = is_sorted
else
is_sorted_act = .false.
endif
cur_value = self%get_value()
if (present(total)) then
total_act = total
else
total_act = cur_value
endif
if (present(unit)) then
unit_act = unit
else
unit_act = output_unit
endif
call self%print(indent_level, total_act, unit_act)
own_value = cur_value - self%sum_of_children()
below_threshold_value = self%sum_of_children_below(threshold)
print_own = associated(self%firstChild)
print_threshold = below_threshold_value%micros > 0
! Deal with "(own)" and "(below threshold)" entries
if (is_sorted_act) then
! sort them in
if (print_own) then
! insert an "(own)" node
own_node%value = own_value
own_node%name = own
call insert_into_sorted_list(self%firstChild, own_node)
endif
if (print_threshold) then
! insert a "(below threshold)" node
threshold_node%value = below_threshold_value
threshold_node%name = below
call insert_into_sorted_list(self%firstChild, threshold_node)
endif
else
! print them first
if (print_own) then
call own_value%print(indent_level + 1, own, cur_value, unit_act)
endif
if (print_threshold) then
call below_threshold_value%print(indent_level + 1, below, cur_value, unit_act)
endif
endif
! print children
node => self%firstChild
do while (associated(node))
node_value = node%get_value()
if (node_value%micros * 1e-6_rk > threshold_act &
.or. associated(node, threshold_node) &
.or. associated(node, own_node)) then
call node%print_graph(indent_level + 1, threshold, is_sorted, cur_value, unit_act)
endif
node => node%nextSibling
end do
if (is_sorted_act) then
! remove inserted dummy nodes again
if (print_own) then
call remove_from_list(self%firstChild, own_node)
endif
if (print_threshold) then
call remove_from_list(self%firstChild, threshold_node)
endif
endif
end subroutine
! In-place sort a node_t linked list and return the first and last element,
subroutine sort_nodes(head, tail)
type(node_t), pointer, intent(inout) :: head, tail
type(node_t), pointer :: p, q, e
type(value_t) :: p_val, q_val
integer :: insize, nmerges, psize, qsize, i
if (.not. associated(head)) then
nullify(tail)
return
endif
insize = 1
do while (.true.)
p => head
nullify(head)
nullify(tail)
nmerges = 0
do while(associated(p))
nmerges = nmerges + 1
q => p
psize = 0
do i = 1, insize
psize = psize + 1
q => q%nextSibling
if (.not. associated(q)) then
exit
endif
end do
qsize = insize
do while (psize > 0 .or. (qsize > 0 .and. associated(q)))
if (psize == 0) then
e => q
q => q%nextSibling
qsize = qsize - 1
else if (qsize == 0 .or. (.not. associated(q))) then
e => p;
p => p%nextSibling
psize = psize - 1
else
p_val = p%get_value()
q_val = q%get_value()
if (p_val%micros >= q_val%micros) then
e => p
p => p%nextSibling
psize = psize - 1
else
e => q
q => q%nextSibling
qsize = qsize - 1
end if
end if
if (associated(tail)) then
tail%nextSibling => e
else
head => e
endif
tail => e
end do
p => q
end do
nullify(tail%nextSibling)
if (nmerges <= 1) then
return
endif
insize = insize * 2
end do
end subroutine
recursive subroutine node_sort_children(self)
class(node_t), intent(inout) :: self
type(node_t), pointer :: node
call sort_nodes(self%firstChild, self%lastChild)
node => self%firstChild
do while (associated(node))
call node%sort_children()
node => node%nextSibling
enddo
end subroutine
end module
module ftimings_type
use, intrinsic :: iso_c_binding, only : C_INT64_T, C_DOUBLE, C_LONG_LONG
implicit none
integer, parameter :: rk = C_DOUBLE
integer, parameter :: name_length = 40
end module
#ifdef HAVE_CONFIG_H
#include "config-f90.h"
#endif
module ftimings_value
use ftimings_type
implicit none
public
interface
function microseconds_since_epoch() result(us) bind(C, name="ftimings_microseconds_since_epoch")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_INT64_T) :: us
end function
end interface
#ifdef HAVE_LIBPAPI
logical :: papi_supported = .false.
interface
function current_flop_count() result(cnt) bind(C, name="ftimings_current_flop_count")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_LONG_LONG) :: cnt
end function
end interface
#endif
type value_t
integer(kind=C_INT64_T) :: micros = 0 ! Cumulative microseconds spent in this node
#ifdef HAVE_LIBPAPI
integer(kind=C_LONG_LONG) :: flop_count = 0 ! Cumulative floating point operations done in this node
#endif
contains
procedure, pass :: print => print_value
end type