Commit 803a3959 authored by Lorenz Huedepohl's avatar Lorenz Huedepohl
Browse files

Counter for memory bandwidth (loads + stores)

Additionally, one can now also measure loads and stores, and thus the
memory bandwidth and, from that, the arithmetic intensity.

One caveat, though: The user is responsible for providing a meaningful
value for the number of bytes transferred in one load/store, via the
"bytes_per_ldst" parameter of the new function %set_print_options.

Until now, I have no way of obtaining this value programmatically, and
it also can and will vary for different sections of a program.

For example, an SSE movapd instruction loads/stores 16 bytes, but is
still counted as one "load and store" instruction, just the same as a
1-byte mov. Feel free to advise me on a better set of machine counters.

Also, somewhat updated documentation.
parent d32bbcb3
......@@ -733,7 +733,7 @@ WARN_LOGFILE =
# spaces.
# Note: If this tag is empty the current directory is searched.
INPUT = ftimings/
INPUT = @SRCDIR@/ftimings/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
......
......@@ -138,7 +138,7 @@ doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
rm -rf @DX_DOCDIR@
$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
$(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG)
DX_CLEANFILES = \
@DX_DOCDIR@/@PACKAGE@.tag \
......
This diff is collapsed.
......@@ -8,11 +8,10 @@ module ftimings_value
public
type value_t
integer(kind=C_INT64_T) :: micros = 0 ! Cumulative microseconds spent in this node
integer(kind=C_LONG) :: rsssize = 0
integer(kind=C_LONG_LONG) :: flop_count = 0 ! Cumulative floating point operations done in this node
contains
procedure, pass :: print => print_value
integer(kind=C_INT64_T) :: micros = 0 ! microseconds spent in this node
integer(kind=C_LONG) :: rsssize = 0 ! newly used resident memory
integer(kind=C_LONG_LONG) :: flop_count = 0 ! floating point operations done in this node
integer(kind=C_LONG_LONG) :: ldst = 0 ! number of loads and stores
end type
interface operator(+)
......@@ -37,6 +36,7 @@ module ftimings_value
c%rsssize = a%rsssize + b%rsssize
#ifdef HAVE_LIBPAPI
c%flop_count = a%flop_count + b%flop_count
c%ldst = a%ldst + b%ldst
#endif
end function
......@@ -47,6 +47,7 @@ module ftimings_value
c%rsssize = a%rsssize - b%rsssize
#ifdef HAVE_LIBPAPI
c%flop_count = a%flop_count - b%flop_count
c%ldst = a%ldst - b%ldst
#endif
end function
......@@ -57,65 +58,7 @@ module ftimings_value
neg_a%rsssize = - a%rsssize
#ifdef HAVE_LIBPAPI
neg_a%flop_count = - a%flop_count
neg_a%ldst = - a%ldst
#endif
end function
!> Write one line of the timing report for this value to `unit`.
!>
!> The line always contains the indented label, the elapsed time in
!> seconds and the elapsed time as a fraction of `total`. Further
!> columns are appended depending on the `print_*` switches.
!>
!> @param self              value to print
!> @param indent_level      nesting depth; the line is indented by 2*indent_level + 1 blanks
!> @param print_memory      append the resident set size, formatted by nice_format
!> @param print_flop_count  append the floating point operation count divided by 1e6
!> @param print_flop_rate   append flop_count / micros (operations per microsecond)
!> @param label             name printed after the "|_ " marker
!> @param total             value whose `micros` is the denominator of the fraction column
!> @param unit              Fortran I/O unit to write to
subroutine print_value(self, indent_level, &
    print_memory, print_flop_count, print_flop_rate, &
    label, total, unit)
  class(value_t), intent(in) :: self
  integer, intent(in) :: indent_level
  logical, intent(in) :: print_memory
  logical, intent(in) :: print_flop_count, print_flop_rate
  character(len=name_length), intent(in) :: label
  type(value_t), intent(in) :: total
  integer, intent(in) :: unit

  character(len=64) :: line_format
  real(kind=rk) :: elapsed_micros, flops

  ! Integer counters converted once to the working real kind
  elapsed_micros = real(self%micros, kind=rk)
  flops = real(self%flop_count, kind=rk)

  ! Build the format at run time, as both the indentation width and the
  ! label width are only known here
  write(line_format,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length

  ! Mandatory columns: label, seconds, fraction of the total time
  write(unit,line_format,advance='no') &
    label, elapsed_micros * 1e-6_rk, elapsed_micros / real(total%micros, kind=rk)

  ! Optional columns, each appended to the same output record
  if (print_memory) write(unit,'(1x,a12)',advance='no') nice_format(real(self%rsssize, kind=C_DOUBLE))
  if (print_flop_count) write(unit,'(1x,f12.2)',advance='no') flops / 1e6_rk
  if (print_flop_rate) write(unit,'(1x,f12.2)',advance='no') flops / elapsed_micros

  ! Terminate the record
  write(unit,'(a)') ""
end subroutine
!> Format a number into a 12 character string using binary prefixes.
!>
!> Values with a magnitude below 2**10 are printed as they are; larger
!> values are divided by the largest fitting power of 1024 and tagged
!> with "ki", "Mi", "Gi" or "Ti". Values of 2**50 and above are printed
!> in scientific notation without a prefix.
!>
!> @param flops   the number to format (any real quantity, despite the name)
!> @return string right-aligned number, followed by the prefix if any
pure elemental function nice_format(flops) result(string)
  real(kind=C_DOUBLE), intent(in) :: flops
  character(len=12) :: string

  ! Successive powers of 1024; all are exactly representable
  real(kind=C_DOUBLE), parameter :: kibi = 1024.0_C_DOUBLE
  real(kind=C_DOUBLE), parameter :: mebi = kibi * kibi
  real(kind=C_DOUBLE), parameter :: gibi = mebi * kibi
  real(kind=C_DOUBLE), parameter :: tebi = gibi * kibi
  real(kind=C_DOUBLE), parameter :: pebi = tebi * kibi

  real(kind=C_DOUBLE) :: magnitude

  magnitude = abs(flops)

  ! The first test is written as a negated ">=" on purpose: this way a
  ! NaN ends up in the plain, un-prefixed branch
  if (.not. (magnitude >= kibi)) then
    write(string,'(f9.1,'' '')') flops
  else if (magnitude < mebi) then
    write(string,'(f9.1,'' ki'')') flops / kibi
  else if (magnitude < gibi) then
    write(string,'(f9.1,'' Mi'')') flops / mebi
  else if (magnitude < tebi) then
    write(string,'(f9.1,'' Gi'')') flops / gibi
  else if (magnitude < pebi) then
    write(string,'(f9.1,'' Ti'')') flops / tebi
  else
    write(string,'(es12.1)') flops
  end if
end function
end module
......@@ -5,51 +5,142 @@
#include "config.h"
#endif
static int flops_event_set;
static int event_set;
static int tried_papi_init = 0;
static int papi_available = 0;
static int flops_available = 0;
static int ldst_available = 0;
#ifdef HAVE_LIBPAPI
#include <papi.h>
int ftimings_papi_init(void) {
int ret;
flops_event_set = PAPI_NULL;
if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n",
__FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret));
return 0;
}
if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
return 0;
if (tried_papi_init) {
return papi_available;
}
if ((ret = PAPI_create_eventset(&flops_event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
return 0;
}
if ((ret = PAPI_add_event(flops_event_set, PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
return 0;
}
if ((ret = PAPI_start(flops_event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
return 0;
#pragma omp critical
{
/* Think about it :) */
if (tried_papi_init) {
goto end;
}
tried_papi_init = 1;
event_set = PAPI_NULL;
if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n",
__FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret));
goto error;
}
if ((ret = PAPI_create_eventset(&event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
goto error;
}
/* Check FLOP counter availability */
if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
flops_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
flops_available = 0;
} else {
flops_available = 1;
}
/* Loads + Stores */
if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_LD_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else {
ldst_available = 1;
}
/* Start */
if ((ret = PAPI_start(event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
goto error;
}
goto end;
error:
/* PAPI works */
papi_available = 0;
end:
/* PAPI works */
papi_available = 1;
} /* End of critical region */
return papi_available;
}
int ftimings_flop_init(void) {
int ret;
if (!tried_papi_init) {
ftimings_papi_init();
}
return 1;
return flops_available;
}
long long ftimings_current_flop_count(void) {
long long count;
int ftimings_loads_stores_init(void) {
int ret;
if ((ret = PAPI_read(flops_event_set, &count)) < 0) {
if (!tried_papi_init) {
ftimings_papi_init();
}
return ldst_available;
}
void ftimings_papi_counters(long long *flops, long long *ldst) {
long long res[3];
int i, ret;
if ((ret = PAPI_read(event_set, &res[0])) < 0) {
fprintf(stderr, "PAPI_read: %s\n", PAPI_strerror(ret));
exit(1);
}
return count;
i = 0;
if (flops_available) {
*flops = res[i++];
} else {
*flops = 0LL;
}
if (ldst_available) {
*ldst = res[i++];
*ldst += res[i++];
} else {
*ldst = 0LL;
}
}
#endif
#include <stdio.h>
#include <unistd.h>
/*#include <sys/time.h>
#include <sys/resource.h>*/
long ftimings_resident_set_size() {
/* struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage) != 0) {
perror("getrusage");
exit(1);
}
return usage.ru_maxrss;
*/
long rss = 0L;
FILE* fp = NULL;
if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) {
......
volatile double a = 0.25;
#define N 1048576L
double megaflop(void) {
double a[N];
double b[N];
double c[N];
double d[N];
void vector_triad(void) {
int i;
double c = 0.5;
for (i=0; i < 500000; i++) {
c = c * a + a;
for (i=0; i < N; i++) {
a[i] = b[i] + c[i] * d[i];
}
return c;
}
......@@ -6,7 +6,14 @@ program test_timings
integer :: i, j
call timer%measure_flops(.true.)
call timer%measure_memory(.true.)
call timer%measure_allocated_memory(.true.)
call timer%measure_memory_bandwidth(.true.)
call timer%set_print_options(&
print_flop_count=.true., &
print_flop_rate=.true., &
print_memory_bandwidth=.true., &
print_ai=.true., bytes_per_ldst=16)
call timer%enable()
......@@ -40,16 +47,16 @@ program test_timings
call timer%print("program")
else
! usual printing of current subtree
call timer%print("program", "cycle", print_flop_count=.true.)
call timer%print("program", "cycle")
endif
write(*,*)
write(*,'(a,f9.6)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle")
write(*,'(a,f9.6)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle")
write(*,'(a,f9.6)') " cycle total : ", timer%get("program", "cycle")
write(*,'(a,f12.6)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle")
write(*,'(a,f12.6)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle")
write(*,'(a,f12.6)') " cycle total : ", timer%get("program", "cycle")
#ifndef _OPENMP
write(*,'(a,f9.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c")
write(*,'(a,f12.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c")
#else
write(*,'(a,f9.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c")
write(*,'(a,f12.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c")
#endif
write(*,*)
endif
......@@ -84,6 +91,7 @@ program test_timings
write(*,*)
write(*,*) "Whole tree:"
call timer%set_print_options(print_ldst=.true.)
call timer%print(is_sorted=.true.)
call timer%free()
......@@ -111,12 +119,14 @@ program test_timings
subroutine c()
interface
subroutine megaflop() bind(C, name="megaflop")
subroutine vector_triad() bind(C, name="vector_triad")
end subroutine
end interface
integer :: i
call timer%start("c")
call megaflop()
call timer%stop("c")
call timer%start("2.097 Mflop, AI=0.0625")
call vector_triad()
call timer%stop()
call timer%stop()
end subroutine
end program
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment