diff --git a/Doxyfile.in b/Doxyfile.in
index e52a9eb4feb9192e94b0193740663d92074dd6fe..d464ae3920efd75434b85119ed3248cb5f6d7d4a 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -733,7 +733,7 @@ WARN_LOGFILE           =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT = ftimings/
+INPUT = @SRCDIR@/ftimings/
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/doxygen.am b/doxygen.am
index cf37f1a5c538f3b4d58ad2b3ee38cca89fba49c9..f869a591a3ce75d6a872f8eae2a5b317f1453089 100644
--- a/doxygen.am
+++ b/doxygen.am
@@ -138,7 +138,7 @@ doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
 
 @DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
 	rm -rf @DX_DOCDIR@
-	$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
+	$(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG)
 
 DX_CLEANFILES = \
 	@DX_DOCDIR@/@PACKAGE@.tag \
diff --git a/ftimings/ftimings.F90 b/ftimings/ftimings.F90
index 0009263ab8b52067370d12d7902fe77854803f6f..8ba1b453e4015443ae98ca26228709cc7d495c71 100644
--- a/ftimings/ftimings.F90
+++ b/ftimings/ftimings.F90
@@ -2,9 +2,12 @@
 #include "config-f90.h"
 #endif
 
-!> Ftimings
+!> \mainpage Ftimings
 !>
-!> An almost pure-fortran attempt to play with graph structures :)
+!> An almost pure-fortran attempt to play with tree structures, which evolved
+!> into the timing library used e.g. by the VERTEX supernova code.
+!>
+!> All you need to know is contained in the \ref ftimings::timer_t derived type.
 module ftimings
   use ftimings_type
   use ftimings_value
@@ -14,19 +17,21 @@ module ftimings
 
   private
 
-  ! this is mainly needed for Doxygen...
+  ! this is mainly needed for Doxygen, they are
+  ! by implicitly reachable as type-bound procedures
+  ! of timer_t, however Doxygen does not document them
+  ! if they are not also public
   public timer_start, timer_stop, timer_free, timer_print, &
          timer_enable, timer_disable, timer_is_enabled, &
-         timer_in_entries, timer_get, timer_since, timer_sort
+         timer_in_entries, timer_get, timer_since, timer_sort, &
+         timer_set_print_options, &
+         timer_measure_flops, &
+         timer_measure_allocated_memory, &
+         timer_measure_memory_bandwidth
 
   character(len=name_length), private, parameter :: own = "(own)"
   character(len=name_length), private, parameter :: below = "(below threshold)"
 
-#ifdef HAVE_LIBPAPI
-  logical :: papi_supported = .false.
-  logical :: have_tried_papi_init = .false.
-#endif
-
   !> Type for a timer instance.
   !>
   !> Typical usage:
@@ -54,22 +59,33 @@ module ftimings
   !>   call timer%free()
   !> \endcode
   type, public :: timer_t
-    private
-    logical :: active = .false.                         !< If set to .false., most operations return immediately without any action
-    logical :: record_memory = .false.                  !< IF set to .true., record also the current resident set size
-    logical :: record_flop_counts = .false.             !< If set to .true., record also FLOP counts via PAPI calls
-    type(node_t), pointer :: root => NULL()             !< Start of graph
-    type(node_t), pointer :: current_node => NULL()     !< Current position in the graph
+    logical, private :: active = .false.                         !< If set to .false., most operations return immediately without any action
+    logical, private :: record_allocated_memory = .false.        !< IF set to .true., record also the current resident set size
+    logical, private :: record_flop_counts = .false.             !< If set to .true., record also FLOP counts via PAPI calls
+    logical, private :: record_memory_bandwidth = .false.        !< If set to .true., record also FLOP counts via PAPI calls
+
+    logical, private :: print_allocated_memory = .false.
+    logical, private :: print_flop_count = .false.
+    logical, private :: print_flop_rate = .false.
+    logical, private :: print_ldst = .false.
+    logical, private :: print_memory_bandwidth = .false.
+    logical, private :: print_ai = .false.
+    integer, private :: bytes_per_ldst = 8
+
+    type(node_t), private, pointer :: root => NULL()             !< Start of graph
+    type(node_t), private, pointer :: current_node => NULL()     !< Current position in the graph
     contains
       procedure, pass :: start => timer_start
       procedure, pass :: stop => timer_stop
       procedure, pass :: free => timer_free
       procedure, pass :: print => timer_print
       procedure, pass :: enable => timer_enable
-      procedure, pass :: measure_flops => timer_measure_flops
-      procedure, pass :: measure_memory => timer_measure_memory
       procedure, pass :: disable => timer_disable
       procedure, pass :: is_enabled => timer_is_enabled
+      procedure, pass :: measure_flops => timer_measure_flops
+      procedure, pass :: measure_allocated_memory => timer_measure_allocated_memory
+      procedure, pass :: measure_memory_bandwidth => timer_measure_memory_bandwidth
+      procedure, pass :: set_print_options => timer_set_print_options
       procedure, pass :: in_entries => timer_in_entries
       procedure, pass :: get => timer_get
       procedure, pass :: since => timer_since
@@ -113,7 +129,7 @@ module ftimings
 
 #ifdef HAVE_LIBPAPI
   interface
-    function papi_init() result(ret) bind(C, name="ftimings_papi_init")
+    function flop_init() result(ret) bind(C, name="ftimings_flop_init")
       use, intrinsic :: iso_c_binding
       implicit none
       integer(kind=C_INT) :: ret
@@ -121,12 +137,20 @@ module ftimings
   end interface
 
   interface
-    function current_flop_count() result(cnt) bind(C, name="ftimings_current_flop_count")
+    function loads_stores_init() result(ret) bind(C, name="ftimings_loads_stores_init")
       use, intrinsic :: iso_c_binding
       implicit none
-      integer(kind=C_LONG_LONG) :: cnt
+      integer(kind=C_INT) :: ret
     end function
   end interface
+
+  interface
+    subroutine papi_counters(flops, ldst) bind(C, name="ftimings_papi_counters")
+      use, intrinsic :: iso_c_binding
+      implicit none
+      integer(kind=C_LONG_LONG), intent(out) :: flops, ldst
+    end subroutine
+  end interface
 #endif
 
   interface
@@ -141,7 +165,6 @@ module ftimings
 
   !> Activate the timer, without this, most methods are non-ops.
   !>
-  !> \implements timer_t::enable
   subroutine timer_enable(self)
     class(timer_t), intent(inout), target :: self
 
@@ -151,65 +174,157 @@ module ftimings
   !> Call with enabled = .true. to also record amount of newly allocated memory.
   !> By default, memory usage is not recored. Call with .false. to deactivate again.
   !>
-  !> \implements timer_t::measure_memory
-  subroutine timer_measure_memory(self, enabled)
+  !> This opens /proc/self/statm, parses it, and closes it agagain and is thus
+  !> quite costly, use when appropriate.
+  !>
+  subroutine timer_measure_allocated_memory(self, enabled)
+    class(timer_t), intent(inout) :: self
+    logical, intent(in) :: enabled
+
+    self%record_allocated_memory = enabled
+  end subroutine
+
+  !> Call with enabled = .true. to also record the memory bandwidth with PAPI
+  !> By default, this is not recorded. Call with .false. to deactivate again.
+  !>
+  subroutine timer_measure_memory_bandwidth(self, enabled)
     class(timer_t), intent(inout) :: self
     logical, intent(in) :: enabled
 
-    self%record_memory = enabled
+    if (enabled) then
+#ifdef HAVE_LIBPAPI
+      if (loads_stores_init() == 1) then
+        self%record_memory_bandwidth = .true.
+      else
+        write(0,'(a)') "ftimings: Could not initialize PAPI, disabling memory bandwidth counter"
+        self%record_memory_bandwidth = .false.
+      endif
+#else
+      write(0,'(a)') "ftimings: not compiled with PAPI support, disabling memory bandwidth counter"
+      self%record_memory_bandwidth = .false.
+#endif
+    else
+      ! explicitly set to .false. by caller
+      self%record_memory_bandwidth = .false.
+    endif
   end subroutine
 
   !> Call with enabled = .true. to also record FLOP counts via PAPI calls.
   !> By default no FLOPS are recored. Call with .false. to deactivate again.
   !>
-  !> \implements timer_t::measure_flops
   subroutine timer_measure_flops(self, enabled)
     class(timer_t), intent(inout) :: self
     logical, intent(in) :: enabled
 
     if (enabled) then
 #ifdef HAVE_LIBPAPI
-!$OMP CRITICAL
-      if (.not. have_tried_papi_init) then
-        have_tried_papi_init = .true.
-        if (papi_init() == 1) then
-          papi_supported = .true.
-        else
-          papi_supported = .false.
-        endif
-      endif
-!$OMP END CRITICAL
-      if (.not. papi_supported) then
-        write(0,'(a)') "ftimings: Could not activate PAPI, disabling FLOP counter"
-        self%record_flop_counts = .false.
-      else
+      if (flop_init() == 1) then
         self%record_flop_counts = .true.
+      else
+        write(0,'(a)') "ftimings: Could not initialize PAPI, disabling FLOP counter"
+        self%record_flop_counts = .false.
       endif
 #else
       write(0,'(a)') "ftimings: not compiled with PAPI support, disabling FLOP counter"
       self%record_flop_counts = .false.
 #endif
     else
-      ! explicitly set to .false. by caller
+      ! Explicitly set to .false. by caller
       self%record_flop_counts = .false.
     endif
   end subroutine
 
-  !> Deactive the timer
+  !> Deactivate the timer
   !>
-  !> \implements timer_t::disable
   subroutine timer_disable(self)
     class(timer_t), intent(inout), target :: self
     self%active = .false.
   end subroutine
 
-  !> \implements timer_t::is_enabled
+  !> Return whether the timer is currently running
+  !>
   function timer_is_enabled(self) result(is)
     class(timer_t), intent(inout), target :: self
     logical :: is
     is = self%active
   end function
 
+  !> Control what to print on following %print calls
+  !>
+  !> \param     print_allocated_memory  Amount of newly allocated, resident memory
+  !> \param     print_flop_count        Number of floating point operations
+  !> \param     print_flop_rate         Rate of floating point operations per second
+  !> \param     print_ldst              Number of loads+stores
+  !> \param     print_memory_bandwidth  Rate of loads+stores per second
+  !> \param     print_ai                Arithmetic intensity, that is number of
+  !>                                    floating point operations per number of
+  !>                                    load and store operations (currently untested)
+  !> \param     bytes_per_ldst          For calculating the AI, assume this number
+  !>                                    of bytes per load or store (default: 8)
+  subroutine timer_set_print_options(self, &
+	print_allocated_memory, &
+	print_flop_count, &
+	print_flop_rate, &
+	print_ldst, &
+        print_memory_bandwidth, &
+	print_ai, &
+        bytes_per_ldst)
+    class(timer_t), intent(inout) :: self
+    logical, intent(in), optional :: &
+        print_allocated_memory, &
+        print_flop_count, &
+        print_flop_rate, &
+        print_ldst, &
+        print_memory_bandwidth, &
+        print_ai
+    integer, intent(in), optional :: bytes_per_ldst
+
+    if (present(print_allocated_memory)) then
+      self%print_allocated_memory = print_allocated_memory
+      if ((.not. self%record_allocated_memory) .and. self%print_allocated_memory) then
+         write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!"
+      endif
+    endif
+
+    if (present(print_flop_count)) then
+      self%print_flop_count = print_flop_count
+      if ((.not. self%record_flop_counts) .and. self%print_flop_count) then
+         write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
+      endif
+    endif
+
+    if (present(print_flop_rate)) then
+      self%print_flop_rate = print_flop_rate
+      if ((.not. self%record_flop_counts) .and. self%print_flop_rate) then
+         write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
+      endif
+    endif
+
+    if (present(print_ldst)) then
+      self%print_ldst = print_ldst
+      if ((.not. self%record_memory_bandwidth) .and. self%print_ldst) then
+         write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros!"
+      endif
+    endif
+    if (present(print_memory_bandwidth)) then
+      self%print_memory_bandwidth = print_memory_bandwidth
+      if ((.not. self%record_memory_bandwidth) .and. self%print_memory_bandwidth) then
+         write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros for memory bandwidth!"
+      endif
+    endif
+
+    if (present(print_ai)) then
+      self%print_ai = print_ai
+      if (.not. (self%record_memory_bandwidth .and. self%record_flop_counts)) then
+         write(0,'(a)') "ftimings: Warning: Memory bandwidth or FLOP counters were disabled, expect invalid values for AI"
+      endif
+    endif
+
+    if (present(bytes_per_ldst)) then
+      self%bytes_per_ldst = bytes_per_ldst
+    endif
+  end subroutine
+
   !> Start a timing section
   !>
   !> \param name        A descriptive name
@@ -236,7 +351,6 @@ module ftimings
   !>   call timer%stop("B")
   !> \endcode
   !>
-  !> \implements timer_t::start
   subroutine timer_start(self, name, replace)
     class(timer_t), intent(inout), target :: self
     character(len=*), intent(in)  :: name
@@ -281,7 +395,7 @@ module ftimings
       !$ do i = 0, omp_get_level()
       !$   write(*,*) "omp_get_ancestor_thread_num(", i, ") = ", omp_get_ancestor_thread_num(i)
       !$ end do
-      call self%root%print_graph(0, .false., .false., .false.)
+      call self%root%print_graph(0)
       !$omp end critical
       stop "timer_start() while same timer was active"
     endif
@@ -315,7 +429,6 @@ module ftimings
   !>                    If given, warns if it does not match the last %start()
   !>                    call on stderr and disables the current timer instance.
   !>
-  !> \implements timer_t::stop
   subroutine timer_stop(self, name)
     class(timer_t), intent(inout), target :: self
     character(len=*), intent(in), optional :: name
@@ -370,7 +483,6 @@ module ftimings
 
   !> Deallocate all objects associated with (but not including) self
   !>
-  !> \implements timer_t::free
   subroutine timer_free(self)
     class(timer_t), intent(inout), target :: self
     if (associated(self%root)) then
@@ -389,21 +501,16 @@ module ftimings
   !> \param threshold   If given, subsume any entries with a value of threshold
   !>                    seconds in a single node "(below threshold)"
   !> \param is_sorted   Assume a sorted graph for inserting "(own)" and "(below threshold)"
+  !> \param unit        The unit number on which to print, default stdout
   !>
-  !> \implements timer_t::print
-  subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit, &
-        print_memory, print_flop_count, print_flop_rate)
+  subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit)
     class(timer_t), intent(in), target :: self
     character(len=*), intent(in), optional :: name1, name2, name3, name4
     real(kind=rk), intent(in), optional :: threshold
     logical, intent(in), optional :: is_sorted
     integer, intent(in), optional :: unit
-    logical, intent(in), optional :: print_memory
-    logical, intent(in), optional :: print_flop_count, print_flop_rate
 
     integer :: unit_act
-    logical :: do_print_memory
-    logical :: do_print_flop_count, do_print_flop_rate
 
     type(node_t), pointer :: node
     character(len=64) :: format_spec
@@ -412,9 +519,13 @@ module ftimings
     character(len=name_length), parameter :: group = "Group"
     character(len=12), parameter :: seconds    = "         [s]"
     character(len=12), parameter :: fract      = "    fraction"
+    character(len=12), parameter :: ram        = "  alloc. RAM"
     character(len=12), parameter :: flop_rate  = "     Mflop/s"
     character(len=12), parameter :: flop_count = "       Mflop"
-    character(len=12), parameter :: ram        = "  alloc. RAM"
+    character(len=12), parameter :: ldst       = "loads+stores"
+    character(len=12), parameter :: bandwidth  = "  mem bandw."
+    character(len=12), parameter :: ai         = "arithm. Int."
+    character(len=12), parameter :: dash       = "============"
 
     if (.not. self%active) then
       return
@@ -456,53 +567,58 @@ module ftimings
       endif
     end if
 
-    if (present(print_memory)) then
-      do_print_memory = print_memory
-    else
-      do_print_memory = self%record_memory
-    endif
-    if ((.not. self%record_memory) .and. do_print_memory) then
-       write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!"
+    ! I really do hate it ..
+    write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length
+    write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract
+
+    if (self%print_allocated_memory) then
+      write(unit_act,'(1x,a12)',advance='no') ram
     endif
 
-    if (present(print_flop_count)) then
-      do_print_flop_count = print_flop_count
-    else
-      do_print_flop_count = self%record_flop_counts
+    if (self%print_flop_count) then
+      write(unit_act,'(1x,a12)',advance='no') flop_count
     endif
-    if ((.not. self%record_flop_counts) .and. do_print_flop_count) then
-       write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
+    if (self%print_flop_rate) then
+      write(unit_act,'(1x,a12)',advance='no') flop_rate
     endif
-
-    if (present(print_flop_rate)) then
-      do_print_flop_rate = print_flop_rate
-    else
-      do_print_flop_rate = self%record_flop_counts
+    if (self%print_ldst) then
+      write(unit_act,'(1x,a12)',advance='no') ldst
     endif
-    if ((.not. self%record_flop_counts) .and. do_print_flop_rate) then
-       write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
+    if (self%print_memory_bandwidth) then
+      write(unit_act,'(1x,a12)',advance='no') bandwidth
+    endif
+    if (self%print_ai) then
+      write(unit_act,'(1x,a12)',advance='no') ai
     endif
 
-    ! I really do hate it ..
-    write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length
-    write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract
+    write(unit_act,'(a)') ""
 
-    if (do_print_memory) then
-      write(unit_act,'(1x,a12)',advance='no') ram
+    write(format_spec,'("("" |  "",a",i0,",1x,2(1x,a12))")') name_length
+    write(unit_act, format_spec, advance='no') "", dash, dash
+
+    if (self%print_allocated_memory) then
+      write(unit_act,'(1x,a12)',advance='no') dash
     endif
 
-    if (do_print_flop_count) then
-      write(unit_act,'(1x,a12)',advance='no') flop_count
+    if (self%print_flop_count) then
+      write(unit_act,'(1x,a12)',advance='no') dash
     endif
-    if (do_print_flop_rate) then
-      write(unit_act,'(1x,a12)',advance='no') flop_rate
+    if (self%print_flop_rate) then
+      write(unit_act,'(1x,a12)',advance='no') dash
+    endif
+    if (self%print_ldst) then
+      write(unit_act,'(1x,a12)',advance='no') dash
+    endif
+    if (self%print_memory_bandwidth) then
+      write(unit_act,'(1x,a12)',advance='no') dash
+    endif
+    if (self%print_ai) then
+      write(unit_act,'(1x,a12)',advance='no') dash
     endif
 
     write(unit_act,'(a)') ""
 
-    call node%print_graph(0, &
-                          do_print_memory, do_print_flop_count, do_print_flop_rate, &
-                          threshold, is_sorted, unit=unit)
+    call node%print_graph(0, threshold, is_sorted, unit=unit)
 
   end subroutine
 
@@ -517,7 +633,6 @@ module ftimings
   !> For example timer%in_entries("foo", "bar", "parallel") returns
   !> the sum of all entries named "parallel" below the foo->bar node
   !>
-  !> \implements timer_t::in_entries
   function timer_in_entries(self, name1, name2, name3, name4) result(s)
     use, intrinsic :: iso_fortran_env, only : error_unit
     class(timer_t), intent(in), target :: self
@@ -572,7 +687,6 @@ module ftimings
   !> The result is only meaningfull if the entry was never appended by
   !> additional %start() calls.
   !>
-  !> \implements timer_t::get
   function timer_get(self, name1, name2, name3, name4, name5, name6) result(s)
     class(timer_t), intent(in), target :: self
     ! this is clunky, but what can you do..
@@ -643,7 +757,6 @@ module ftimings
   !> The result is only meaningfull if the entry was never appended by
   !> additional %start() calls.
   !>
-  !> \implements timer_t::since
   function timer_since(self, name1, name2, name3, name4) result(s)
     class(timer_t), intent(in), target :: self
     character(len=*), intent(in), optional :: name1, name2, name3, name4
@@ -693,7 +806,6 @@ module ftimings
   !> Sort the graph on each level.
   !> Warning: This irrevocable destroys the old ordering.
   !>
-  !> \implements timer_t::sort
   subroutine timer_sort(self)
     class(timer_t), intent(inout), target :: self
     type(node_t), pointer :: node
@@ -721,14 +833,13 @@ module ftimings
     ! current time
     val%micros = microseconds_since_epoch()
 
-    if (self%timer%record_memory) then
+    if (self%timer%record_allocated_memory) then
       val%rsssize = resident_set_size()
     endif
 
 #ifdef HAVE_LIBPAPI
-    ! flop counter
-    if (self%timer%record_flop_counts) then
-      val%flop_count = current_flop_count()
+    if (self%timer%record_flop_counts .or. self%timer%record_memory_bandwidth) then
+      call papi_counters(val%flop_count, val%ldst)
     endif
 #endif
   end function
@@ -927,12 +1038,9 @@ module ftimings
     end do
   end subroutine
 
-  subroutine node_print(self, indent_level, &
-        print_memory, print_flop_count, print_flop_rate, &
-        total, unit)
+  subroutine node_print(self, indent_level, total, unit)
     class(node_t), intent(inout) :: self
     integer, intent(in) :: indent_level
-    logical, intent(in) :: print_memory, print_flop_count, print_flop_rate
     type(value_t), intent(in) :: total
     type(value_t) :: val
     integer, intent(in) :: unit
@@ -954,18 +1062,13 @@ module ftimings
     else
       val = self%value
     endif
-    call val%print(indent_level, &
-        print_memory, print_flop_count, print_flop_rate, &
-        name, total, unit)
+    call print_value(val, self%timer, indent_level, name, total, unit)
   end subroutine
 
-  recursive subroutine node_print_graph(self, indent_level, &
-	    print_memory, print_flop_count, print_flop_rate, &
-        threshold, is_sorted, total, unit)
+  recursive subroutine node_print_graph(self, indent_level, threshold, is_sorted, total, unit)
     use, intrinsic :: iso_fortran_env, only : output_unit
     class(node_t), intent(inout) :: self
     integer, intent(in) :: indent_level
-    logical, intent(in) :: print_memory, print_flop_count, print_flop_rate
     real(kind=rk), intent(in), optional :: threshold
     logical, intent(in), optional :: is_sorted
     type(value_t), intent(in), optional :: total
@@ -1006,9 +1109,7 @@ module ftimings
       unit_act = output_unit
     endif
 
-    call self%print(indent_level, &
-        print_memory, print_flop_count, print_flop_rate, &
-        total_act, unit_act)
+    call self%print(indent_level, total_act, unit_act)
 
     own_value = cur_value - self%sum_of_children()
     below_threshold_value = self%sum_of_children_below(threshold)
@@ -1024,6 +1125,7 @@ module ftimings
         allocate(own_node)
         own_node%value = own_value
         own_node%name = own
+        own_node%timer => self%timer
         call insert_into_sorted_list(self%firstChild, own_node)
       endif
 
@@ -1032,20 +1134,17 @@ module ftimings
         allocate(threshold_node)
         threshold_node%value = below_threshold_value
         threshold_node%name = below
+        threshold_node%timer => self%timer
         call insert_into_sorted_list(self%firstChild, threshold_node)
       endif
 
     else
       ! print them first
       if (print_own) then
-        call own_value%print(indent_level + 1, &
-            print_memory, print_flop_count, print_flop_rate, &
-            own, cur_value, unit_act)
+        call print_value(own_value, self%timer, indent_level + 1, own, cur_value, unit_act)
       endif
       if (print_threshold) then
-        call below_threshold_value%print(indent_level + 1, &
-            print_memory, print_flop_count, print_flop_rate, &
-            below, cur_value, unit_act)
+        call print_value(below_threshold_value, self%timer, indent_level + 1, below, cur_value, unit_act)
       endif
     endif
 
@@ -1056,9 +1155,7 @@ module ftimings
       if (node_value%micros * 1e-6_rk >= threshold_act &
                 .or. associated(node, threshold_node) &
                 .or. associated(node, own_node)) then
-        call node%print_graph(indent_level + 1, &
-            print_memory, print_flop_count, print_flop_rate, &
-            threshold, is_sorted, cur_value, unit_act)
+        call node%print_graph(indent_level + 1, threshold, is_sorted, cur_value, unit_act)
       endif
       node => node%nextSibling
     end do
@@ -1176,4 +1273,69 @@ module ftimings
     enddo
   end subroutine
 
+  subroutine print_value(value, timer, indent_level, label, total, unit)
+    type(value_t), intent(in) :: value
+    type(timer_t), intent(in) :: timer
+    integer, intent(in) :: indent_level
+    character(len=name_length), intent(in) :: label
+    type(value_t), intent(in) :: total
+    integer, intent(in) :: unit
+
+    character(len=64) :: format_spec
+
+    write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length
+    write(unit,format_spec,advance='no') &
+      label, &
+      real(value%micros, kind=rk) * 1e-6_rk, &
+      real(value%micros, kind=rk) / real(total%micros, kind=rk)
+
+    if (timer%print_allocated_memory) then
+      write(unit,'(1x,a12)',advance='no') &
+        nice_format(real(value%rsssize, kind=C_DOUBLE))
+    endif
+    if (timer%print_flop_count) then
+      write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / 1e6_rk
+    endif
+    if (timer%print_flop_rate) then
+      write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / value%micros
+    endif
+    if (timer%print_ldst) then
+      write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst, kind=rk))
+    endif
+    if (timer%print_memory_bandwidth) then
+      write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst*timer%bytes_per_ldst, kind=rk) / (value%micros * 1e-6_rk))
+    endif
+    if (timer%print_ai) then
+      write(unit,'(1x,f12.4)',advance='no') real(value%flop_count, kind=rk) / value%ldst / timer%bytes_per_ldst
+    endif
+
+    write(unit,'(a)') ""
+  end subroutine
+
+  pure elemental function nice_format(number) result(string)
+    real(kind=C_DOUBLE), intent(in) :: number
+    character(len=12) :: string
+    real(kind=C_DOUBLE), parameter :: &
+        kibi = 2.0_C_DOUBLE**10, &
+        mebi = 2.0_C_DOUBLE**20, &
+        gibi = 2.0_C_DOUBLE**30, &
+        tebi = 2.0_C_DOUBLE**40, &
+        pebi = 2.0_C_DOUBLE**50
+
+    if (abs(number) >= pebi) then
+      write(string,'(es12.2)') number
+    else if (abs(number) >= tebi) then
+      write(string,'(f9.2,'' Ti'')') number / tebi
+    else if (abs(number) >= gibi) then
+      write(string,'(f9.2,'' Gi'')') number / gibi
+    else if (abs(number) >= mebi) then
+      write(string,'(f9.2,'' Mi'')') number / mebi
+    else if (abs(number) >= kibi) then
+      write(string,'(f9.2,'' ki'')') number / kibi
+    else
+      write(string,'(f12.2)') number
+    endif
+  end function
+
+
 end module
diff --git a/ftimings/ftimings_value.F90 b/ftimings/ftimings_value.F90
index 4586be465d962e1c0befe6ba035f3d3e744d3575..d329d7caf7351a719ce2192a9f074bf8214c84a8 100644
--- a/ftimings/ftimings_value.F90
+++ b/ftimings/ftimings_value.F90
@@ -8,11 +8,10 @@ module ftimings_value
   public
 
   type value_t
-    integer(kind=C_INT64_T) :: micros = 0          ! Cumulative microseconds spent in this node
-    integer(kind=C_LONG) :: rsssize = 0
-    integer(kind=C_LONG_LONG) :: flop_count = 0    ! Cumulative floating point operations done in this node
-    contains
-      procedure, pass :: print => print_value
+    integer(kind=C_INT64_T) :: micros = 0         ! microseconds spent in this node
+    integer(kind=C_LONG) :: rsssize = 0           ! newly used resident memory
+    integer(kind=C_LONG_LONG) :: flop_count = 0   ! floating point operations done in this node
+    integer(kind=C_LONG_LONG) :: ldst = 0         ! number of loads and stores
   end type
 
   interface operator(+)
@@ -37,6 +36,7 @@ module ftimings_value
     c%rsssize = a%rsssize + b%rsssize
 #ifdef HAVE_LIBPAPI
     c%flop_count = a%flop_count + b%flop_count
+    c%ldst = a%ldst + b%ldst
 #endif
   end function
 
@@ -47,6 +47,7 @@ module ftimings_value
     c%rsssize = a%rsssize - b%rsssize
 #ifdef HAVE_LIBPAPI
     c%flop_count = a%flop_count - b%flop_count
+    c%ldst = a%ldst - b%ldst
 #endif
   end function
 
@@ -57,65 +58,7 @@ module ftimings_value
     neg_a%rsssize = - a%rsssize
 #ifdef HAVE_LIBPAPI
     neg_a%flop_count = - a%flop_count
+    neg_a%ldst = - a%ldst
 #endif
   end function
-
-  subroutine print_value(self, indent_level, &
-        print_memory, print_flop_count, print_flop_rate, &
-        label, total, unit)
-    class(value_t), intent(in) :: self
-    integer, intent(in) :: indent_level
-    logical, intent(in) :: print_memory
-    logical, intent(in) :: print_flop_count, print_flop_rate
-    character(len=name_length), intent(in) :: label
-    type(value_t), intent(in) :: total
-    integer, intent(in) :: unit
-
-    character(len=64) :: format_spec
-
-    write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length
-    write(unit,format_spec,advance='no') &
-      label, &
-      real(self%micros, kind=rk) * 1e-6_rk, &
-      real(self%micros, kind=rk) / real(total%micros, kind=rk)
-
-    if (print_memory) then
-      write(unit,'(1x,a12)',advance='no') &
-        nice_format(real(self%rsssize, kind=C_DOUBLE))
-    endif
-    if (print_flop_count) then
-      write(unit,'(1x,f12.2)',advance='no') real(self%flop_count, kind=rk) / 1e6_rk
-    endif
-    if (print_flop_rate) then
-      write(unit,'(1x,f12.2)',advance='no') real(self%flop_count, kind=rk) / self%micros
-    endif
-
-    write(unit,'(a)') ""
-  end subroutine
-
-  pure elemental function nice_format(flops) result(string)
-    real(kind=C_DOUBLE), intent(in) :: flops
-    character(len=12) :: string
-    real(kind=C_DOUBLE), parameter :: &
-        kibi = 2.0_C_DOUBLE**10, &
-        mebi = 2.0_C_DOUBLE**20, &
-        gibi = 2.0_C_DOUBLE**30, &
-        tebi = 2.0_C_DOUBLE**40, &
-        pebi = 2.0_C_DOUBLE**50
-
-    if (abs(flops) >= pebi) then
-      write(string,'(es12.1)') flops
-    else if (abs(flops) >= tebi) then
-      write(string,'(f9.1,'' Ti'')') flops / tebi
-    else if (abs(flops) >= gibi) then
-      write(string,'(f9.1,'' Gi'')') flops / gibi
-    else if (abs(flops) >= mebi) then
-      write(string,'(f9.1,'' Mi'')') flops / mebi
-    else if (abs(flops) >= kibi) then
-      write(string,'(f9.1,'' ki'')') flops / kibi
-    else
-      write(string,'(f9.1,''  '')') flops
-    endif
-  end function
-
 end module
diff --git a/ftimings/papi.c b/ftimings/papi.c
index db3d86b4b660f3ce605323a90aff295de333ffdb..9e12c6c89bf05de5f2adcc9ecd649ddcc350ca84 100644
--- a/ftimings/papi.c
+++ b/ftimings/papi.c
@@ -5,51 +5,142 @@
 #include "config.h"
 #endif
 
-static int flops_event_set;
+static int event_set;
+
+static int tried_papi_init = 0;
+static int papi_available = 0;
+static int flops_available = 0;
+static int ldst_available = 0;
 
 #ifdef HAVE_LIBPAPI
 #include <papi.h>
 
 int ftimings_papi_init(void) {
 	int ret;
-	flops_event_set = PAPI_NULL;
 
-	if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) {
-		fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n",
-				__FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret));
-		return 0;
-	}
-	if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
-		fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
-				__FILE__, __LINE__, PAPI_strerror(ret));
-		return 0;
+	if (tried_papi_init) {
+		return papi_available;
 	}
 
-	if ((ret = PAPI_create_eventset(&flops_event_set)) < 0) {
-		fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n",
-				__FILE__, __LINE__, PAPI_strerror(ret));
-		return 0;
-	}
-	if ((ret = PAPI_add_event(flops_event_set, PAPI_DP_OPS)) < 0) {
-		fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n",
-				__FILE__, __LINE__, PAPI_strerror(ret));
-		return 0;
-	}
-	if ((ret = PAPI_start(flops_event_set)) < 0) {
-		fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n",
-				__FILE__, __LINE__, PAPI_strerror(ret));
-		return 0;
+#pragma omp critical
+	{
+		/* Think about it :) */
+		if (tried_papi_init) {
+			goto end;
+		}
+
+		tried_papi_init = 1;
+
+		event_set = PAPI_NULL;
+
+		if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n",
+					__FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret));
+			goto error;
+		}
+
+		if ((ret = PAPI_create_eventset(&event_set)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			goto error;
+		}
+
+		/* Check FLOP counter availability */
+		if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			flops_available = 0;
+		} else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			flops_available = 0;
+		} else {
+			flops_available = 1;
+		}
+
+		/* Loads + Stores */
+		if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			ldst_available = 0;
+		} else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			ldst_available = 0;
+		} else if ((ret = PAPI_add_event(event_set, PAPI_LD_INS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			ldst_available = 0;
+		} else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			ldst_available = 0;
+		} else {
+			ldst_available = 1;
+		}
+
+		/* Start */
+		if ((ret = PAPI_start(event_set)) < 0) {
+			fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n",
+					__FILE__, __LINE__, PAPI_strerror(ret));
+			goto error;
+		}
+
+		goto end;
+
+error:
+		/* PAPI works */
+		papi_available = 0;
+
+end:
+		/* PAPI works */
+		papi_available = 1;
+
+	} /* End of critical region */
+
+	return papi_available;
+}
+
+int ftimings_flop_init(void) {
+	int ret;
+
+	if (!tried_papi_init) {
+		ftimings_papi_init();
 	}
-	return 1;
+
+	return flops_available;
 }
 
-long long ftimings_current_flop_count(void) {
-	long long count;
+int ftimings_loads_stores_init(void) {
 	int ret;
-	if ((ret = PAPI_read(flops_event_set, &count)) < 0) {
+
+	if (!tried_papi_init) {
+		ftimings_papi_init();
+	}
+
+	return ldst_available;
+}
+
+void ftimings_papi_counters(long long *flops, long long *ldst) {
+	long long res[3];
+	int i, ret;
+
+	if ((ret = PAPI_read(event_set, &res[0])) < 0) {
 		fprintf(stderr, "PAPI_read: %s\n", PAPI_strerror(ret));
 		exit(1);
 	}
-	return count;
+
+	i = 0;
+	if (flops_available) {
+		*flops = res[i++];
+	} else {
+		*flops = 0LL;
+	}
+	if (ldst_available) {
+		*ldst = res[i++];
+		*ldst += res[i++];
+	} else {
+		*ldst = 0LL;
+	}
 }
 #endif
diff --git a/ftimings/resident_set_size.c b/ftimings/resident_set_size.c
index 5ce519f8719c780316d4d2cdaf18ca44aea939fc..fc395b0e0fd94b751c08f78cc1bd4ce105698db5 100644
--- a/ftimings/resident_set_size.c
+++ b/ftimings/resident_set_size.c
@@ -1,18 +1,7 @@
 #include <stdio.h>
 #include <unistd.h>
 
-/*#include <sys/time.h>
-#include <sys/resource.h>*/
-
 long ftimings_resident_set_size() {
-/*	struct rusage usage;
-	if (getrusage(RUSAGE_SELF, &usage) != 0) {
-		perror("getrusage");
-		exit(1);
-	}
-	return usage.ru_maxrss;
-*/
-
 	long rss = 0L;
 	FILE* fp = NULL;
 	if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) {
diff --git a/test/do_flops.c b/test/do_flops.c
index 31701ef665dc511158402517eb137cb003cb8029..ab6f60cc24cf05904a4e6dc64247f313455ee0b9 100644
--- a/test/do_flops.c
+++ b/test/do_flops.c
@@ -1,11 +1,14 @@
-volatile double a = 0.25;
+#define N 1048576L
 
-double megaflop(void) {
+double a[N];
+double b[N];
+double c[N];
+double d[N];
+
+void vector_triad(void) {
 	int i;
-	double c = 0.5;
 
-	for (i=0; i < 500000; i++) {
-		c = c * a + a;
+	for (i=0; i < N; i++) {
+		a[i] = b[i] + c[i] * d[i];
 	}
-	return c;
 }
diff --git a/test/test_timings.F90 b/test/test_timings.F90
index 7d216defe91361ebadfff1783fdb262d558c2449..d253b7f82e34d452d90b4bb2a42d4db67462ca86 100644
--- a/test/test_timings.F90
+++ b/test/test_timings.F90
@@ -6,7 +6,14 @@ program test_timings
   integer :: i, j
 
   call timer%measure_flops(.true.)
-  call timer%measure_memory(.true.)
+  call timer%measure_allocated_memory(.true.)
+  call timer%measure_memory_bandwidth(.true.)
+
+  call timer%set_print_options(&
+                print_flop_count=.true., &
+                print_flop_rate=.true., &
+                print_memory_bandwidth=.true., &
+                print_ai=.true., bytes_per_ldst=16)
 
   call timer%enable()
 
@@ -40,16 +47,16 @@ program test_timings
         call timer%print("program")
       else
         ! usual printing of current subtree
-        call timer%print("program", "cycle", print_flop_count=.true.)
+        call timer%print("program", "cycle")
       endif
       write(*,*)
-      write(*,'(a,f9.6)') "  c part: ", timer%in_entries("c") / timer%get("program", "cycle")
-      write(*,'(a,f9.6)') "  b part: ", timer%in_entries("b") / timer%get("program", "cycle")
-      write(*,'(a,f9.6)') "  cycle total : ", timer%get("program", "cycle")
+      write(*,'(a,f12.6)') "  c part: ", timer%in_entries("c") / timer%get("program", "cycle")
+      write(*,'(a,f12.6)') "  b part: ", timer%in_entries("b") / timer%get("program", "cycle")
+      write(*,'(a,f12.6)') "  cycle total : ", timer%get("program", "cycle")
 #ifndef _OPENMP
-      write(*,'(a,f9.6)') "  cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c")
+      write(*,'(a,f12.6)') "  cycle -> a -> b -> c : ", timer%get("program", "cycle", "a", "b", "c")
 #else
-      write(*,'(a,f9.6)') "  cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c")
+      write(*,'(a,f12.6)') "  cycle -> a -> b -> c : ", timer%get("program", "cycle", "parallel", "a", "b", "c")
 #endif
       write(*,*)
     endif
@@ -84,6 +91,7 @@ program test_timings
 
   write(*,*)
   write(*,*) "Whole tree:"
+  call timer%set_print_options(print_ldst=.true.)
   call timer%print(is_sorted=.true.)
 
   call timer%free()
@@ -111,12 +119,14 @@ program test_timings
 
   subroutine c()
     interface
-      subroutine megaflop() bind(C, name="megaflop")
+      subroutine vector_triad() bind(C, name="vector_triad")
       end subroutine
     end interface
     integer :: i
     call timer%start("c")
-    call megaflop()
-    call timer%stop("c")
+    call timer%start("2.097 Mflop, AI=0.0625")
+    call vector_triad()
+    call timer%stop()
+    call timer%stop()
   end subroutine
 end program