Commit c9a7e72c authored by Lorenz Huedepohl's avatar Lorenz Huedepohl
Browse files

C-API for ftimings

This is a rather big update: ftimings can now be used via a C API, see
test/c_test.c for an example of how to use it.

This step also led to a slight overhaul of the Fortran API: there are
now also a number of ..._node member functions of timer_t that can be
used if one cannot or does not want to specify the node of interest via
an explicit chain of names. An example:

Instead of

  type(timer_t) :: timer

  ...

  call timer%print("foo", "bar", "baz")

one can now also do

  type(timer_t) :: timer
  type(node_t) :: node

  ...

  node = timer%get_root_node()
  node = node%get_child("foo")
  node = node%get_child("bar")
  node = node%get_child("baz")

  call timer%print_node(node)

This construction might sometimes be necessary, e.g. if the hierarchy is
very dynamic or if the currently provided maximum of six levels in
the non-_node functions is not sufficient (but think about whether you
REALLY need more than six levels...).

This is done similarly on the C side; there, variadic argument lists even
remove any restriction on the number of levels. Still, _node functions
are provided there as well. All C-API symbols are prefixed with "ftimings_"
in order to avoid name clashes.
parent 49797bea
......@@ -15,7 +15,8 @@ libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES = \
ftimings/highwater_mark.c \
ftimings/ftimings_type.F90 \
ftimings/ftimings_value.F90 \
ftimings/ftimings.F90
ftimings/ftimings.F90 \
ftimings/ftimings_c_support.c
if HAVE_LIBPAPI
libftimings_@FTIMINGS_API_VERSION@_@FC@_la_SOURCES += \
......@@ -31,13 +32,17 @@ libftimings_@FTIMINGS_API_VERSION@_@FC@_la_LDFLAGS = -version-info $(FTIMINGS_SO
ftimings_includedir = $(includedir)/ftimings-$(FTIMINGS_API_VERSION)-$(FC)
nobase_ftimings_include_HEADERS = ftimings.mod
ftimings_include_HEADERS = ftimings/ftimings.h ftimings/ftimings_generated.h
ftimings_libincludedir = $(libdir)/ftimings-$(FTIMINGS_API_VERSION)-$(FC)/include
./ftimings/ftimings_generated.h: $(srcdir)/ftimings/ftimings.F90
grep "^ *!c>" $(srcdir)/ftimings/ftimings.F90 | sed 's/^ *!c>//;' > ftimings/ftimings_generated.h
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = ftimings-$(FTIMINGS_API_VERSION)-$(FC).pc
# programs
bin_PROGRAMS = ftimings_@FC@_test
bin_PROGRAMS = ftimings_@FC@_test ftimings_c_test
# test
ftimings_@FC@_test_SOURCES = \
......@@ -47,6 +52,12 @@ ftimings_@FC@_test_SOURCES = \
ftimings_@FC@_test_LDADD = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
ftimings_@FC@_test_LDFLAGS = -static
ftimings_c_test_SOURCES = \
test/c_test.c
ftimings_c_test_LDADD = libftimings-@FTIMINGS_API_VERSION@-@FC@.la
ftimings_c_test_LINK = $(FCLINK) -static
noinst_PROGRAMS = do_flops do_memory_transfer
do_flops_SOURCES = \
......
This diff is collapsed.
#ifndef _FTIMINGS_H
#define _FTIMINGS_H
#include <ftimings/ftimings_generated.h>
/*
 * Private helper for the variadic ..._impl() wrappers.
 *
 * Must be expanded inside a variadic function that has a parameter named
 * `timer` in scope; the macro refers to it implicitly.
 *
 *   node          lvalue that receives the result of ftimings_node()
 *   last_arg      the last named parameter of the enclosing function,
 *                 handed to va_start()
 *   error_return  statement (without trailing semicolon) executed when
 *                 ftimings_node() yields NULL, e.g. `return NAN`
 *
 * The debug message is newline-terminated like all other debug() messages,
 * so that consecutive messages do not run together on one line.
 */
#define __ftimings_node_from_args(node, last_arg, error_return) \
    do { \
        va_list args; \
        va_start(args, last_arg); \
        node = ftimings_node(timer, args); \
        va_end(args); \
        \
        if (!node) { \
            debug("ftimings_get_node() returned NULL\n"); \
            error_return; \
        } \
    } while(0)
/*
 * Variadic convenience API.
 *
 * Each macro forwards to its ..._impl() function and appends a terminating
 * null pointer, so callers simply list the node names:
 *
 *     ftimings_print(timer, 0.0, "foobar", "blafasel");
 *
 * The terminator is cast to `const char *` explicitly: the _impl functions
 * fetch every variadic argument with va_arg(args, const char*), and a bare
 * NULL is allowed to be a plain integer constant, which must not be passed
 * through `...` in place of a pointer.
 *
 * `##__VA_ARGS__` (GNU extension) drops the preceding comma when no names
 * are given, e.g. ftimings_print(timer, 0.0) for the whole tree.
 */
#define ftimings_in_entries(timer, ...) ftimings_in_entries_impl(timer, ##__VA_ARGS__, (const char *) NULL)
double ftimings_in_entries_impl(ftimer_t *timer, ...);

#define ftimings_print(timer, threshold, ...) ftimings_print_impl(timer, threshold, ##__VA_ARGS__, (const char *) NULL)
void ftimings_print_impl(ftimer_t *timer, double threshold, ...);

#define ftimings_get(timer, ...) ftimings_get_impl(timer, ##__VA_ARGS__, (const char *) NULL)
double ftimings_get_impl(ftimer_t *timer, ...);

#define ftimings_since(timer, ...) ftimings_since_impl(timer, ##__VA_ARGS__, (const char *) NULL)
double ftimings_since_impl(ftimer_t *timer, ...);
#endif
#define FTIMINGS_PRIVATE
#include <ftimings/ftimings.h>
#include <stddef.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
static int looked_for_verbose_debug = 0;
static int verbose_debug = 0;
/*
 * printf-style diagnostic output, written to stderr only when verbose
 * debugging is switched on.
 *
 * On the first call the environment variable FTIMINGS_DEBUG is looked up;
 * verbose output is enabled when its value starts with the character '1'.
 * The outcome is cached in the file-level flags, so the environment is
 * consulted only once.
 */
void debug(const char *format, ...) {
    va_list ap;

    if (!looked_for_verbose_debug) {
        const char *setting = getenv("FTIMINGS_DEBUG");

        looked_for_verbose_debug = 1;
        /* Only the first character is significant. */
        if (setting != NULL && setting[0] == '1') {
            verbose_debug = 1;
        }
    }

    if (!verbose_debug) {
        return;
    }

    va_start(ap, format);
    vfprintf(stderr, format, ap);
    va_end(ap);
}
/*
 * Resolve a NULL-terminated list of names to a node of the timer tree.
 *
 * Starting at the root node of `timer`, each name fetched from `args`
 * selects a child of the node reached so far. An immediately terminated
 * list therefore yields the root node itself.
 *
 * Returns the node reached, or NULL if some name has no matching child;
 * in that case the failure is reported through debug() and
 * ftimings_error(). The caller owns `args` and is responsible for
 * va_start()/va_end().
 */
ftimer_node_t* ftimings_node(ftimer_t *timer, va_list args) {
    ftimer_node_t *current = ftimings_root_node(timer);

    for (;;) {
        const char *child_name = va_arg(args, const char*);

        if (child_name == NULL) {
            /* End of the name list: `current` is the requested node. */
            break;
        }

        current = ftimings_node_get_child(current, child_name);
        if (current == NULL) {
            debug("Cannot descend to %s\n", child_name);
            ftimings_error(timer, "ftimings_get_node(): Cannot descend to node");
            return NULL;
        }
    }

    return current;
}
/*
 * Backend of the ftimings_in_entries() macro.
 *
 * The variadic arguments are a NULL-terminated list of names. The LAST
 * name is the entry name handed to ftimings_in_entries_node(); all names
 * before it form the path, starting at the root node, to the node below
 * which the entries are looked up. With a single name the lookup starts
 * at the root node.
 *
 * Returns the value of ftimings_in_entries_node(), or NAN (after calling
 * ftimings_error()) if no name was given or the path cannot be followed.
 *
 * Every va_start() is paired with a va_end() on all return paths, as the
 * C standard requires.
 */
double ftimings_in_entries_impl(ftimer_t *timer, ...) {
    va_list args;
    const char *name, *next_name;
    ftimer_node_t *node;

    node = ftimings_root_node(timer);

    va_start(args, timer);

    name = va_arg(args, const char*);
    if (name == NULL) {
        va_end(args);
        ftimings_error(timer, "ftimings_in_entries(): Missing argument");
        return NAN;
    }

    /*
     * Look one argument ahead: `name` is only used for descending while
     * another name follows it, so the final name stays available as the
     * entry name for the call below.
     */
    while ((next_name = va_arg(args, const char*)) != NULL) {
        node = ftimings_node_get_child(node, name);
        if (node == NULL) {
            va_end(args);
            debug("Cannot descend to %s\n", name);
            ftimings_error(timer, "ftimings_in_entries(): Cannot descend to node");
            return NAN;
        }
        name = next_name;
    }

    va_end(args);

    return ftimings_in_entries_node(timer, node, name);
}
/*
 * Backend of the ftimings_print() macro: resolves the NULL-terminated
 * name list that follows `threshold` to a node and passes that node,
 * together with `threshold`, to ftimings_print_node(). Returns without
 * printing when the node lookup fails.
 */
void ftimings_print_impl(ftimer_t *timer, double threshold, ...) {
    ftimer_node_t *target;

    /* `threshold` is the last named parameter, hence the va_start anchor. */
    __ftimings_node_from_args(target, threshold, return);

    ftimings_print_node(timer, threshold, target);
}
/*
 * Backend of the ftimings_get() macro: resolves the NULL-terminated name
 * list to a node and returns ftimings_get_node() for it, or NAN when the
 * node lookup fails.
 */
double ftimings_get_impl(ftimer_t *timer, ...) {
    ftimer_node_t *target;

    __ftimings_node_from_args(target, timer, return NAN);

    return ftimings_get_node(timer, target);
}
/*
 * Backend of the ftimings_since() macro: resolves the NULL-terminated name
 * list to a node and returns ftimings_since_node() for it, or NAN when the
 * node lookup fails.
 */
double ftimings_since_impl(ftimer_t *timer, ...) {
    ftimer_node_t *target;

    __ftimings_node_from_args(target, timer, return NAN);

    return ftimings_since_node(timer, target);
}
......@@ -16,7 +16,8 @@
! along with ftimings. If not, see <http://www.gnu.org/licenses/>.
module ftimings_type
use, intrinsic :: iso_c_binding, only : C_INT64_T, C_DOUBLE, C_LONG_LONG, C_LONG, C_INT
use, intrinsic :: iso_c_binding, only : &
C_INT64_T, C_DOUBLE, C_LONG_LONG, C_LONG, C_INT
implicit none
integer, parameter :: rk = C_DOUBLE
integer, parameter :: name_length = 40
......
......@@ -34,6 +34,10 @@ static int flops_available = 0;
#ifdef HAVE_LIBPAPI
#include <papi.h>
/* Exit handler (registered via atexit() in papi_init()): calls PAPI_shutdown(). */
static void papi_uninit(void) {
PAPI_shutdown();
}
static int papi_init(void) {
int ret;
......@@ -69,7 +73,7 @@ static int papi_init(void) {
/* FLOP counter
*/
if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_INS): %s\n",
__FILE__, __LINE__, PAPI_strerror(ret));
flops_available = 0;
} else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) {
......@@ -83,6 +87,7 @@ static int papi_init(void) {
/* PAPI works */
papi_available = 1;
atexit(papi_uninit);
} while(0); /* End of critical region */
......
......@@ -20,8 +20,10 @@
#include "config.h"
#endif
#include <stdio.h>
#define _XOPEN_SOURCE
#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
......@@ -29,6 +31,7 @@
#include <inttypes.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/unistd.h>
static int perf_available = 0;
......
#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <ftimings/ftimings.h>
/*
 * Error callback registered on the timer in main(). `handle` is the
 * user pointer given at registration time; it is interpreted as a C
 * string naming the timer. The message is written to stderr.
 */
static void error_handler(ftimer_t *timer, void *handle, const char *string) {
    const char *timer_label = (char *) handle;

    (void) timer; /* not needed for reporting */

    fprintf(stderr, "Got error on timer '%s': %s\n", timer_label, string);
}
/*
 * Example / smoke test for the C API of ftimings.
 *
 * Creates a timer, measures two nested regions, then measures the cost of
 * empty start/stop pairs in three loops (plain, after enabling flop
 * measurement, after enabling memory bandwidth measurement), and finally
 * prints the collected tree and a few derived values.
 *
 * argc and argv are not used.
 */
int main(int argc, char **argv) {
ftimer_t *timer;
const char *timername = "mytimer";
int i = 0;
timer = ftimings_create();
/* timername is passed as the opaque handle that error_handler() prints. */
ftimings_register_error_handler(timer, error_handler, (void*) timername);
ftimings_enable(timer);
/* Two nested regions: "blafasel" is started and stopped inside "foobar". */
ftimings_start(timer, "foobar");
sleep(1);
ftimings_start(timer, "blafasel");
sleep(1);
ftimings_stop(timer, "blafasel");
ftimings_stop(timer, "foobar");
printf("\"foobar\" entry:\n");
ftimings_print(timer, 0.0, "foobar");
printf("\n");
printf("\"foobar->blafasel\" entry:\n");
ftimings_print(timer, 0.0, "foobar", "blafasel");
printf("\n");
/* Loop with just _start and _stop calls to measure overhead */
ftimings_start(timer, "overhead");
for (i = 0; i < 1000000; i++) {
ftimings_start(timer, "empty");
ftimings_stop(timer, "empty");
}
ftimings_stop(timer, "overhead");
/* Now with PAPI enabled */
ftimings_measure_flops(timer, 1);
ftimings_start(timer, "overhead_papi");
for (i = 0; i < 1000000; i++) {
ftimings_start(timer, "empty");
ftimings_stop(timer, "empty");
}
ftimings_stop(timer, "overhead_papi");
/* Now with PERF enabled */
ftimings_measure_memory_bandwidth(timer, 1);
ftimings_measure_flops(timer, 0);
ftimings_start(timer, "overhead_perf");
/* Note: this loop runs 100000 iterations, ten times fewer than the two above. */
for (i = 0; i < 100000; i++) {
ftimings_start(timer, "empty");
ftimings_stop(timer, "empty");
}
ftimings_stop(timer, "overhead_perf");
printf("Full tree:\n");
ftimings_print_flop_count(timer, 1);
ftimings_print_memory_transferred(timer, 1);
/* No names given: the node lookup resolves to the root node. */
ftimings_print(timer, 0.0);
printf("\n");
ftimings_sort(timer);
printf("Full tree sorted:\n");
ftimings_print(timer, 0.0);
printf("\n");
printf("Time spent in \"blafasel\" entries: %g\n",
ftimings_in_entries(timer, "blafasel"));
printf("Time spent in \"blafasel\" entries below \"foobar\": %g\n",
ftimings_in_entries(timer, "foobar", "blafasel"));
/*
 * NOTE(review): presumably ftimings_get() returns seconds - confirm. If so,
 * the total for 1000000 start/stop pairs in seconds equals the cost of a
 * single pair in microseconds, which is why the raw value is printed with
 * the unit "microseconds".
 */
printf("\nOverhead of ftimings_start() + ftimings_stop() calls:\n %.2g microseconds\n",
ftimings_get(timer, "overhead"));
printf("\nIncluding PAPI (if usable):\n %.2g microseconds\n",
ftimings_get(timer, "overhead_papi"));
/* Factor 10 compensates for the ten times shorter "overhead_perf" loop. */
printf("\nIncluding PERF (if usable):\n %.2g microseconds\n",
10 * ftimings_get(timer, "overhead_perf"));
printf("\nTime since program start: %.2g seconds\n", ftimings_since(timer));
ftimings_destroy(timer);
return 0;
}
#define N 10000000L
#define N 1000000L
double a[N];
double b[N];
double c[N];
double d[N];
/* This should produce 20 million floating point operations,
/* This should produce 2 million floating point operations,
* with an arithmetic intensity of
*
* AI = #FLOP / #BYTES = 2 / (4 * sizeof(double)) = 0.0625
......
......@@ -5,13 +5,12 @@
#endif
#define mebi (1024L * 1024L)
#define gibi (1024L * mebi)
volatile char *p = NULL;
void fill_1_gib(void) {
void fill_100_mebi(void) {
char c = 0x42;
long i;
volatile char *p = NULL;
#ifdef NON_TEMPORAL
__m128i v = _mm_set_epi8(c, c, c, c,
c, c, c, c,
......@@ -21,14 +20,12 @@ void fill_1_gib(void) {
int j;
#endif
if (p == NULL) {
if ((p = malloc(gibi)) == NULL) {
perror("malloc()");
exit(1);
}
if ((p = malloc(100L * mebi)) == NULL) {
perror("malloc()");
exit(1);
}
for (i = 0; i < gibi; i+=64) {
for (i = 0; i < 100L * mebi; i+=64) {
#ifdef NON_TEMPORAL
_mm_stream_si128((__m128i *)&p[i+ 0], v);
_mm_stream_si128((__m128i *)&p[i+16], v);
......@@ -40,11 +37,13 @@ void fill_1_gib(void) {
}
#endif
}
free((void *) p);
}
#ifdef TEST_DO_MEMORY_TRANSFER
int main(int argc, char **argv) {
fill_1_gib();
fill_100_mebi();
return 0;
}
#endif
......@@ -72,9 +72,11 @@ program test_timings
call timer%print("program", "cycle")
endif
write(*,*)
write(*,'(a,f12.6)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle")
write(*,'(a,f12.6)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle")
write(*,'(a,f12.6)') " cycle total : ", timer%get("program", "cycle")
write(*,'(a,f12.6)') " cycle total : ", timer%get("program", "cycle")
write(*,'(a,f12.6)') " in c entries: ", timer%in_entries("c")
write(*,'(a,f8.2,a)') " c part: ", timer%in_entries("c") / timer%get("program", "cycle") * 100, "%"
write(*,'(a,f12.6)') " in b entries: ", timer%in_entries("b")
write(*,'(a,f8.2,a)') " b part: ", timer%in_entries("b") / timer%get("program", "cycle") * 100, "%"
#ifndef _OPENMP
write(*,'(a,f12.6)') " cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c")
#else
......@@ -96,15 +98,15 @@ program test_timings
write(*,*)
write(*,*) "Sorted:"
call timer%sort()
call timer%print("program", is_sorted=.true.)
call timer%print("program")
write(*,*)
write(*,*) "Ignoring entries <0.02s:"
call timer%print("program", is_sorted=.true., threshold=0.02_C_DOUBLE)
call timer%print("program", threshold=0.02_C_DOUBLE)
write(*,*)
write(*,*) "Whole tree:"
call timer%print(is_sorted=.true.)
call timer%print()
call timer%free()
......@@ -120,6 +122,8 @@ program test_timings
subroutine b()
integer :: i
call timer%start("b")
call timer%start("empty")
call timer%stop("empty")
!$omp parallel do
do i = 1, 4
call c
......@@ -136,18 +140,18 @@ program test_timings
end interface
interface
subroutine fill_1_gib() bind(C, name="fill_1_gib")
subroutine fill_100_mebi() bind(C, name="fill_100_mebi")
end subroutine
end interface
call timer%start("c")
call timer%start("20.0 Mflop, AI=0.0625")
call timer%start("2.0 Mflop, AI=0.0625")
call vector_triad()
call timer%stop()
call timer%start("Fill 1 GiB")
call fill_1_gib()
call timer%start("Fill 100 MiB")
call fill_100_mebi()
call timer%stop()
call timer%stop()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment