Commit 49797bea authored by Lorenz Huedepohl's avatar Lorenz Huedepohl
Browse files

Measure RAM access with Linux perf API

There is currently no reliable way to measure RAM accesses with PAPI.
The previous approach of counting load and store instructions is not
very useful, as it is unknown how many bytes are transferred by each
instruction.

On certain CPUs there is a reliable way to measure this via "uncore"
performance counters. You can check whether your CPU (and/or Linux
kernel version) supports this by checking whether the files

	/sys/devices/uncore_imc/events/data_reads
	/sys/devices/uncore_imc/events/data_writes

exist.

To access these counters from an unprivileged program, one has to set
the "paranoia" level of the perf subsystem to at most 0, adjustable via
the file

	/proc/sys/kernel/perf_event_paranoid

Along with this change there is a small API/ABI breakage as some keyword
arguments related to the memory measurement have been renamed/split-up.
parent d6e80973
......@@ -10,13 +10,23 @@ lib_LTLIBRARIES = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
# libftimings
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES = \
ftimings/time.c \
ftimings/papi.c \
ftimings/resident_set_size.c \
ftimings/virtual_memory.c \
ftimings/highwater_mark.c \
ftimings/ftimings_type.F90 \
ftimings/ftimings_value.F90 \
ftimings/ftimings.F90
if HAVE_LIBPAPI
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES += \
ftimings/papi.c
endif
if HAVE_PERF
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES += \
ftimings/perf_memory_counter.c
endif
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_LDFLAGS = -version-info $(FTIMINGS_SO_VERSION)
ftimings_includedir = $(includedir)/ftimings-$(FTIMINGS_API_VERSION)-$(FC)
......@@ -32,10 +42,21 @@ bin_PROGRAMS = ftimings_@FC@_test
# test
ftimings_@FC@_test_SOURCES = \
test/test_timings.F90 \
test/do_flops.c
test/do_flops.c \
test/do_memory_transfer.c
ftimings_@FC@_test_LDADD = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
ftimings_@FC@_test_LDFLAGS = -static
noinst_PROGRAMS = do_flops do_memory_transfer
do_flops_SOURCES = \
test/do_flops.c
do_flops_CPPFLAGS = -DTEST_DO_FLOPS
do_memory_transfer_SOURCES = \
test/do_memory_transfer.c
do_memory_transfer_CPPFLAGS = -DTEST_DO_MEMORY_TRANSFER
# other files to distribute
filesdir = $(datadir)/@PACKAGE@-@FC@/examples
files_DATA = test/test_timings.F90
......
......@@ -12,21 +12,29 @@ AM_PROG_CC_C_O
AC_OPENMP
AC_ARG_ENABLE([papi],
[AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count, autodetected by default])],
[want_papi=$enableval],[want_papi="auto"])
papi_found=unknown
if test x"$want_papi" != x"no" ; then
[AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count])],
[want_papi=$enableval],[want_papi="yes"])
if test "$want_papi" = "yes" ; then
AC_CHECK_LIB([papi],[PAPI_library_init],[papi_found="yes"],[papi_found="no"])
if test x"$want_papi" = x"yes" ; then
if test x"$papi_found" = x"no" ; then
AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"])
fi
if test "$papi_found" = "no" ; then
AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"])
fi
fi
if test x"$papi_found" = x"yes"; then
AC_DEFINE([HAVE_LIBPAPI], [1], [Use the PAPI library])
LIBS="-lpapi $LIBS"
fi
AM_CONDITIONAL([HAVE_LIBPAPI],[test "$want_papi" = "yes"])
AC_ARG_ENABLE([perf],
[AS_HELP_STRING([--disable-perf],[Do not use the Linux perf API to measure amount of DRAM memory accesses])],
[want_perf=$enableval],[want_perf="yes"])
if test "$want_perf" = "yes" ; then
AC_CHECK_HEADERS([linux/perf_event.h],[perf_found="yes"],[perf_found="no"])
if test "$perf_found" = "no" ; then
AC_MSG_ERROR(["Could not find linux/perf_event.h, please adjust CPPFLAGS"])
fi
AC_DEFINE([HAVE_PERF], [1], [Use Linux perf API])
fi
AM_CONDITIONAL([HAVE_PERF],[test "$want_perf" = "yes"])
AC_LANG([Fortran])
AC_PROG_FC
......@@ -47,7 +55,7 @@ DX_MAN_FEATURE(ON)
DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ftimings], [Doxyfile], [docs])
AC_SUBST([FTIMINGS_SO_VERSION], [0:1:0])
AC_SUBST([FTIMINGS_SO_VERSION], [1:0:0])
AC_SUBST([FTIMINGS_API_VERSION], [0.1])
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_FCFLAGS])
......
......@@ -83,17 +83,16 @@ module ftimings
logical, private :: record_virtual_memory = .false. !< IF set to .true., record also the virtual memory
logical, private :: record_max_allocated_memory = .false. !< IF set to .true., record also the max resident set size ("high water mark")
logical, private :: record_flop_counts = .false. !< If set to .true., record also FLOP counts via PAPI calls
logical, private :: record_memory_bandwidth = .false. !< If set to .true., record also FLOP counts via PAPI calls
logical, private :: record_memory_bandwidth = .false. !< If set to .true., record also the memory read/write counters via the Linux perf API
logical, private :: print_allocated_memory = .false.
logical, private :: print_max_allocated_memory = .false.
logical, private :: print_virtual_memory = .false.
logical, private :: print_flop_count = .false.
logical, private :: print_flop_rate = .false.
logical, private :: print_ldst = .false.
logical, private :: print_memory_transferred = .false.
logical, private :: print_memory_bandwidth = .false.
logical, private :: print_ai = .false.
integer, private :: bytes_per_ldst = 8
type(node_t), private, pointer :: root => NULL() !< Start of graph
type(node_t), private, pointer :: current_node => NULL() !< Current position in the graph
......@@ -162,7 +161,17 @@ module ftimings
end interface
interface
function loads_stores_init() result(ret) bind(C, name="ftimings_loads_stores_init")
subroutine flop_counter(flop) bind(C, name="ftimings_flop_counter")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_LONG_LONG), intent(out) :: flop
end subroutine
end interface
#endif
#ifdef HAVE_PERF
interface
function perf_memory_counters_init() result(ret) bind(C, name="ftimings_perf_memory_counters_init")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_INT) :: ret
......@@ -170,10 +179,10 @@ module ftimings
end interface
interface
subroutine papi_counters(flops, ldst) bind(C, name="ftimings_papi_counters")
subroutine perf_memory_counters(mem_reads, mem_writes) bind(C, name="ftimings_perf_memory_counters")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_LONG_LONG), intent(out) :: flops, ldst
integer(kind=C_INT64_T), intent(out) :: mem_reads, mem_writes
end subroutine
end interface
#endif
......@@ -260,15 +269,15 @@ module ftimings
logical, intent(in) :: enabled
if (enabled) then
#ifdef HAVE_LIBPAPI
if (loads_stores_init() == 1) then
#ifdef HAVE_PERF
if (perf_memory_counters_init() == 1) then
self%record_memory_bandwidth = .true.
else
write(0,'(a)') "ftimings: Could not initialize PAPI, disabling memory bandwidth counter"
write(0,'(a)') "ftimings: Could not initialize Linux perf, disabling memory counters"
self%record_memory_bandwidth = .false.
endif
#else
write(0,'(a)') "ftimings: not compiled with PAPI support, disabling memory bandwidth counter"
write(0,'(a)') "ftimings: not compiled with Linux perf support, disabling memory counters"
self%record_memory_bandwidth = .false.
#endif
else
......@@ -327,24 +336,21 @@ module ftimings
!> resident memory ("high water mark")
!> \param print_flop_count Number of floating point operations
!> \param print_flop_rate Rate of floating point operations per second
!> \param print_ldst Number of loads+stores
!> \param print_memory_bandwidth Rate of loads+stores per second
!> \param print_memory_transferred Memory transferred from RAM to CPU
!> \param print_memory_bandwidth Memory bandwidth from RAM to CPU
!> \param print_ai Arithmetic intensity, that is number of
!> floating point operations per
!> number of load and store
!> number of bytes transferred
!> operations (currently untested)
!> \param bytes_per_ldst For calculating the AI, assume this number
!> of bytes per load or store (default: 8)
subroutine timer_set_print_options(self, &
print_allocated_memory, &
print_virtual_memory, &
print_max_allocated_memory, &
print_flop_count, &
print_flop_rate, &
print_ldst, &
print_allocated_memory, &
print_virtual_memory, &
print_max_allocated_memory, &
print_flop_count, &
print_flop_rate, &
print_memory_transferred, &
print_memory_bandwidth, &
print_ai, &
bytes_per_ldst)
print_ai)
class(timer_t), intent(inout) :: self
logical, intent(in), optional :: &
print_allocated_memory, &
......@@ -352,10 +358,9 @@ module ftimings
print_max_allocated_memory, &
print_flop_count, &
print_flop_rate, &
print_ldst, &
print_memory_transferred, &
print_memory_bandwidth, &
print_ai
integer, intent(in), optional :: bytes_per_ldst
if (present(print_allocated_memory)) then
self%print_allocated_memory = print_allocated_memory
......@@ -392,16 +397,16 @@ module ftimings
endif
endif
if (present(print_ldst)) then
self%print_ldst = print_ldst
if ((.not. self%record_memory_bandwidth) .and. self%print_ldst) then
write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros!"
if (present(print_memory_transferred)) then
self%print_memory_transferred = print_memory_transferred
if ((.not. self%record_memory_bandwidth) .and. self%print_memory_transferred) then
write(0,'(a)') "ftimings: Warning: Memory counters were disabled, expect zeros!"
endif
endif
if (present(print_memory_bandwidth)) then
self%print_memory_bandwidth = print_memory_bandwidth
if ((.not. self%record_memory_bandwidth) .and. self%print_memory_bandwidth) then
write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros for memory bandwidth!"
write(0,'(a)') "ftimings: Warning: Memory counters were disabled, expect zeros for memory bandwidth!"
endif
endif
......@@ -411,10 +416,6 @@ module ftimings
write(0,'(a)') "ftimings: Warning: Memory bandwidth or FLOP counters were disabled, expect invalid values for AI"
endif
endif
if (present(bytes_per_ldst)) then
self%bytes_per_ldst = bytes_per_ldst
endif
end subroutine
!> Start a timing section
......@@ -609,17 +610,19 @@ module ftimings
! I hate fortran's string handling
character(len=name_length), parameter :: group = "Group"
character(len=12), parameter :: seconds = " [s]"
character(len=12), parameter :: fract = " fraction"
character(len=12), parameter :: ram = " alloc. RAM"
character(len=12), parameter :: vmem = " alloc. VM"
character(len=12), parameter :: hwm = " alloc. HWM"
character(len=12), parameter :: flop_rate = " Mflop/s"
character(len=12), parameter :: flop_count = " Mflop"
character(len=12), parameter :: ldst = "loads+stores"
character(len=12), parameter :: bandwidth = " mem bandw."
character(len=12), parameter :: ai = "arithm. Int."
character(len=12), parameter :: dash = "============"
character(len=12), parameter :: seconds = " [s]"
character(len=12), parameter :: fract = " fraction"
character(len=12), parameter :: ram = " alloc. RAM"
character(len=12), parameter :: vmem = " alloc. VM"
character(len=12), parameter :: hwm = " alloc. HWM"
character(len=12), parameter :: flop_rate = " Mflop/s"
character(len=12), parameter :: flop_count = " Mflop"
character(len=12), parameter :: mem_reads = " RAM read"
character(len=12), parameter :: mem_writes = " RAM written"
character(len=12), parameter :: bandwidth_read = " RAM read/s"
character(len=12), parameter :: bandwidth_write = " RAM write/s"
character(len=12), parameter :: ai = "arithm. Int."
character(len=12), parameter :: dash = "============"
if (.not. self%active) then
return
......@@ -683,11 +686,13 @@ module ftimings
if (self%print_flop_rate) then
write(unit_act,'(1x,a12)',advance='no') flop_rate
endif
if (self%print_ldst) then
write(unit_act,'(1x,a12)',advance='no') ldst
if (self%print_memory_transferred) then
write(unit_act,'(1x,a12)',advance='no') mem_reads
write(unit_act,'(1x,a12)',advance='no') mem_writes
endif
if (self%print_memory_bandwidth) then
write(unit_act,'(1x,a12)',advance='no') bandwidth
write(unit_act,'(1x,a12)',advance='no') bandwidth_read
write(unit_act,'(1x,a12)',advance='no') bandwidth_write
endif
if (self%print_ai) then
write(unit_act,'(1x,a12)',advance='no') ai
......@@ -716,11 +721,13 @@ module ftimings
if (self%print_flop_rate) then
write(unit_act,'(1x,a12)',advance='no') dash
endif
if (self%print_ldst) then
if (self%print_memory_transferred) then
write(unit_act,'(1x,a12)',advance='no') dash
write(unit_act,'(1x,a12)',advance='no') dash
endif
if (self%print_memory_bandwidth) then
write(unit_act,'(1x,a12)',advance='no') dash
write(unit_act,'(1x,a12)',advance='no') dash
endif
if (self%print_ai) then
write(unit_act,'(1x,a12)',advance='no') dash
......@@ -956,10 +963,17 @@ module ftimings
endif
#ifdef HAVE_LIBPAPI
if (self%timer%record_flop_counts .or. self%timer%record_memory_bandwidth) then
call papi_counters(val%flop_count, val%ldst)
if (self%timer%record_flop_counts) then
call flop_counter(val%flop_count)
endif
#endif
#ifdef HAVE_PERF
if (self%timer%record_memory_bandwidth) then
call perf_memory_counters(val%mem_reads, val%mem_writes)
endif
#endif
end function
......@@ -1193,7 +1207,6 @@ module ftimings
integer, intent(in), optional :: unit
type(node_t), pointer :: node
integer :: i
type(value_t) :: cur_value, node_value, own_value, below_threshold_value, total_act
type(node_t), pointer :: own_node, threshold_node
real(kind=rk) :: threshold_act
......@@ -1430,14 +1443,16 @@ module ftimings
if (timer%print_flop_rate) then
write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / value%micros
endif
if (timer%print_ldst) then
write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst, kind=rk))
if (timer%print_memory_transferred) then
write(unit,'(1x,a12)',advance='no') nice_format(real(value%mem_reads, kind=rk))
write(unit,'(1x,a12)',advance='no') nice_format(real(value%mem_writes, kind=rk))
endif
if (timer%print_memory_bandwidth) then
write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst*timer%bytes_per_ldst, kind=rk) / (value%micros * 1e-6_rk))
write(unit,'(1x,a12)',advance='no') nice_format(real(value%mem_reads, kind=rk) / (value%micros * 1e-6_rk))
write(unit,'(1x,a12)',advance='no') nice_format(real(value%mem_writes, kind=rk) / (value%micros * 1e-6_rk))
endif
if (timer%print_ai) then
write(unit,'(1x,f12.4)',advance='no') real(value%flop_count, kind=rk) / value%ldst / timer%bytes_per_ldst
write(unit,'(1x,f12.4)',advance='no') real(value%flop_count, kind=rk) / (value%mem_writes + value%mem_reads)
endif
write(unit,'(a)') ""
......
......@@ -32,7 +32,8 @@ module ftimings_value
integer(kind=C_LONG) :: rsssize = 0 ! newly used resident memory
integer(kind=C_LONG_LONG) :: flop_count = 0 ! floating point operations done in this node
integer(kind=C_LONG_LONG) :: ldst = 0 ! number of loads and stores
integer(kind=C_INT64_T) :: mem_reads = 0 ! bytes read from DRAM
integer(kind=C_INT64_T) :: mem_writes = 0 ! bytes written to DRAM
end type
interface operator(+)
......@@ -61,7 +62,10 @@ module ftimings_value
c%maxrsssize = a%maxrsssize + b%maxrsssize
#ifdef HAVE_LIBPAPI
c%flop_count = a%flop_count + b%flop_count
c%ldst = a%ldst + b%ldst
#endif
#ifdef HAVE_PERF
c%mem_reads = a%mem_reads + b%mem_reads
c%mem_writes = a%mem_writes + b%mem_writes
#endif
end function
......@@ -74,7 +78,10 @@ module ftimings_value
c%maxrsssize = a%maxrsssize - b%maxrsssize
#ifdef HAVE_LIBPAPI
c%flop_count = a%flop_count - b%flop_count
c%ldst = a%ldst - b%ldst
#endif
#ifdef HAVE_PERF
c%mem_reads = a%mem_reads - b%mem_reads
c%mem_writes = a%mem_writes - b%mem_writes
#endif
end function
......@@ -87,7 +94,10 @@ module ftimings_value
neg_a%maxrsssize = - a%maxrsssize
#ifdef HAVE_LIBPAPI
neg_a%flop_count = - a%flop_count
neg_a%ldst = - a%ldst
#endif
#ifdef HAVE_PERF
neg_a%mem_reads = - a%mem_reads
neg_a%mem_writes = - a%mem_writes
#endif
end function
end module
......@@ -18,6 +18,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
......@@ -28,12 +30,11 @@ static int event_set;
static int tried_papi_init = 0;
static int papi_available = 0;
static int flops_available = 0;
static int ldst_available = 0;
#ifdef HAVE_LIBPAPI
#include <papi.h>
int ftimings_papi_init(void) {
static int papi_init(void) {
int ret;
if (tried_papi_init) {
......@@ -41,10 +42,10 @@ int ftimings_papi_init(void) {
}
#pragma omp critical
{
do {
/* Think about it :) */
if (tried_papi_init) {
goto end;
break;
}
tried_papi_init = 1;
......@@ -54,16 +55,19 @@ int ftimings_papi_init(void) {
if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n",
__FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret));
goto error;
papi_available = 0;
break;
}
if ((ret = PAPI_create_eventset(&event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
goto error;
papi_available = 0;
break;
}
/* Check FLOP counter availability */
/* FLOP counter
*/
if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
......@@ -76,70 +80,24 @@ int ftimings_papi_init(void) {
flops_available = 1;
}
/* Loads + Stores */
if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_LD_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
ldst_available = 0;
} else {
ldst_available = 1;
}
/* Start */
if ((ret = PAPI_start(event_set)) < 0) {
fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
goto error;
}
goto end;
error:
/* PAPI works */
papi_available = 0;
end:
/* PAPI works */
papi_available = 1;
} /* End of critical region */
} while(0); /* End of critical region */
return papi_available;
}
int ftimings_flop_init(void) {
int ret;
if (!tried_papi_init) {
ftimings_papi_init();
papi_init();
}
return flops_available;
}
int ftimings_loads_stores_init(void) {
int ret;
if (!tried_papi_init) {
ftimings_papi_init();
}
return ldst_available;
}
void ftimings_papi_counters(long long *flops, long long *ldst) {
void ftimings_flop_counter(long long *flop) {
long long res[3];
int i, ret;
......@@ -150,15 +108,9 @@ void ftimings_papi_counters(long long *flops, long long *ldst) {
i = 0;
if (flops_available) {
*flops = res[i++];
} else {
*flops = 0LL;
}
if (ldst_available) {
*ldst = res[i++];
*ldst += res[i++];
*flop = res[i++];
} else {
*ldst = 0LL;
*flop = 0LL;
}
}
#endif
/* Copyright 2014 Lorenz Hüdepohl
*
* This file is part of ftimings.
*
* ftimings is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ftimings is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with ftimings. If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static int perf_available = 0;
static int tried_perf_init = 0;
static int mem_reads_fd = -1, mem_writes_fd = -1;
/*
 * Minimal wrapper that issues the perf_event_open system call directly
 * through syscall(), forwarding all arguments unchanged.
 *
 * Returns whatever syscall() returns, converted to int.
 */
static inline int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
/*
 * Close the file descriptors held in mem_reads_fd and mem_writes_fd, if
 * they are open, and mark them as closed again.
 *
 * A descriptor counts as open when it is non-negative: -1 is the value
 * both variables are initialized with, whereas 0 is a perfectly valid
 * descriptor (for example when stdin has been closed beforehand), so
 * testing for "> 0" would leak it.
 *
 * Each descriptor is reset to -1 after close() so that calling this
 * function a second time is harmless instead of closing an unrelated
 * descriptor that happened to reuse the same number.
 */
static void ftimings_perf_memory_counters_uninit(void) {
	if (mem_reads_fd >= 0) {
		close(mem_reads_fd);
		mem_reads_fd = -1;
	}
	if (mem_writes_fd >= 0) {
		close(mem_writes_fd);
		mem_writes_fd = -1;
	}
}
int ftimings_perf_memory_counters_init(void) {
struct perf_event_attr pe;
int type;
FILE *ff;
pid_t pid = -1;
int cpu = 0;
unsigned long flags = 0;
unsigned int event_reads = 0, event_writes = 0;