Commit cb4c4ae7 authored by Andreas Marek's avatar Andreas Marek
Browse files

Remove assumed size from generic real kernel

The generic real kernel is now contained in a module, this allows
strict interface checking! It also does not use assumed size arrays
anymore. Both points increase the possibility to debug and find errors.

However, this might be performance critical! It is possible to
switch back to the old implementation if that turns out to
be beneficial w.r.t. performance. Timings with gfortran 4.9 on Intel
Haswell showed that the new implementation is about 30 percent faster
then the previous one
parent 33a94bfc
......@@ -39,7 +39,7 @@ if HAVE_DETAILED_TIMINGS
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
if WITH_COMPLEX_GENERIC_KERNEL
......@@ -308,6 +308,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
......
......@@ -666,6 +666,11 @@ DX_MAN_FEATURE(ON)
DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS])
AC_SUBST([with_amd_bulldozer_kernel])
......
......@@ -53,26 +53,49 @@
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
!module real_generic_kernel
#include "config-f90.h"
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
#define PACK_REAL_TO_COMPLEX
#else
#undef PACK_REAL_TO_COMPLEX
#endif
#ifndef DESPERATELY_WANT_ASSUMED_SIZE
module real_generic_kernel
private
public double_hh_trafo_generic
contains
#endif
! private
! public double_hh_trafo_generic
!contains
subroutine double_hh_trafo_generic(q, hh, nb, nq, ldq, ldh)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use iso_c_binding
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
real(kind=rk), intent(inout) :: q(ldq,*)
real(kind=rk), intent(in) :: hh(ldh,*)
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk), intent(inout) :: q(ldq,*)
real(kind=rk), intent(in) :: hh(ldh,*)
#else
real(kind=rk), intent(inout) :: q(1:ldq,1:nb+1)
real(kind=rk), intent(in) :: hh(1:ldh,1:6)
#endif
real(kind=rk) :: s
integer(kind=ik) :: i
real(kind=rk) :: s
integer(kind=ik) :: i
! equivalence(q(1,1),q_complex(1,1))
! Safety only:
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: double_hh_trafo_generic")
#endif
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
......@@ -84,20 +107,42 @@
! Do the Householder transformations
#ifndef DESPERATELY_WANT_ASSUMED_SIZE
! ! assign real data to compplex pointer
! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)])
#endif
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_kernel_12_generic(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_12_generic(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
print *,"calling 8"
call hh_trafo_kernel_8_generic(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_8_generic(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
else if(nq-i+1 > 0) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
print *,"calling 4"
call hh_trafo_kernel_4_generic(q(i,1),hh, nb, ldq, ldh, s)
endif
#else
call hh_trafo_kernel_4_generic(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: double_hh_trafo_generic")
#endif
end subroutine double_hh_trafo_generic
! --------------------------------------------------------------------------------------------------
......@@ -109,30 +154,267 @@
subroutine hh_trafo_kernel_12_generic(q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck), intent(inout) :: q(ldq/2,*)
real(kind=rk), intent(in) :: hh(ldh,*), s
real(kind=rk), intent(in) :: h(ldh,*)
#else
real(kind=rk), intent(inout) :: q(:,:)
#endif
real(kind=rk), intent(in) :: hh(ldh,2), s
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=ck) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6
#else
real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
#endif
real(kind=rk) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_12_generic")
#endif
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
#ifndef PACK_REAL_TO_COMPLEX
x7 = q(7,2)
x8 = q(8,2)
x9 = q(9,2)
x10 = q(10,2)
x11 = q(11,2)
x12 = q(12,2)
#endif
y1 = q(1 ,1) + q(1, 2)*hh(2,2)
y2 = q(2 ,1) + q(2, 2)*hh(2,2)
y3 = q(3 ,1) + q(3, 2)*hh(2,2)
y4 = q(4 ,1) + q(4, 2)*hh(2,2)
y5 = q(5 ,1) + q(5, 2)*hh(2,2)
y6 = q(6 ,1) + q(6, 2)*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
y7 = q(7 ,1) + q(7, 2)*hh(2,2)
y8 = q(8 ,1) + q(8, 2)*hh(2,2)
y9 = q(9 ,1) + q(9, 2)*hh(2,2)
y10 = q(10,1) + q(10,2)*hh(2,2)
y11 = q(11,1) + q(11,2)*hh(2,2)
y12 = q(12,1) + q(12,2)*hh(2,2)
#endif
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
x1 = x1 + q(1, i)*h1
y1 = y1 + q(1, i)*h2
x2 = x2 + q(2, i)*h1
y2 = y2 + q(2, i)*h2
x3 = x3 + q(3, i)*h1
y3 = y3 + q(3, i)*h2
x4 = x4 + q(4, i)*h1
y4 = y4 + q(4, i)*h2
x5 = x5 + q(5, i)*h1
y5 = y5 + q(5, i)*h2
x6 = x6 + q(6, i)*h1
y6 = y6 + q(6, i)*h2
#ifndef PACK_REAL_TO_COMPLEX
x7 = x7 + q(7, i)*h1
y7 = y7 + q(7, i)*h2
x8 = x8 + q(8, i)*h1
y8 = y8 + q(8, i)*h2
x9 = x9 + q(9, i)*h1
y9 = y9 + q(9, i)*h2
x10 = x10 + q(10,i)*h1
y10 = y10 + q(10,i)*h2
x11 = x11 + q(11,i)*h1
y11 = y11 + q(11,i)*h2
x12 = x12 + q(12,i)*h1
y12 = y12 + q(12,i)*h2
#endif
enddo
x1 = x1 + q(1,nb+1)*hh(nb,1)
x2 = x2 + q(2,nb+1)*hh(nb,1)
x3 = x3 + q(3,nb+1)*hh(nb,1)
x4 = x4 + q(4,nb+1)*hh(nb,1)
x5 = x5 + q(5,nb+1)*hh(nb,1)
x6 = x6 + q(6,nb+1)*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
x7 = x7 + q(7, nb+1)*hh(nb,1)
x8 = x8 + q(8, nb+1)*hh(nb,1)
x9 = x9 + q(9, nb+1)*hh(nb,1)
x10 = x10 + q(10,nb+1)*hh(nb,1)
x11 = x11 + q(11,nb+1)*hh(nb,1)
x12 = x12 + q(12,nb+1)*hh(nb,1)
#endif
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1 *h1
x2 = x2 *h1
x3 = x3 *h1
x4 = x4 *h1
x5 = x5 *h1
x6 = x6 *h1
#ifndef PACK_REAL_TO_COMPLEX
x7 = x7 *h1
x8 = x8 *h1
x9 = x9 *h1
x10 = x10*h1
x11 = x11*h1
x12 = x12*h1
#endif
h1 = -tau2
h2 = -tau2*s
y1 = y1 *h1 + x1 *h2
y2 = y2 *h1 + x2 *h2
y3 = y3 *h1 + x3 *h2
y4 = y4 *h1 + x4 *h2
y5 = y5 *h1 + x5 *h2
y6 = y6 *h1 + x6 *h2
#ifndef PACK_REAL_TO_COMPLEX
y7 = y7 *h1 + x7 *h2
y8 = y8 *h1 + x8 *h2
y9 = y9 *h1 + x9 *h2
y10 = y10*h1 + x10*h2
y11 = y11*h1 + x11*h2
y12 = y12*h1 + x12*h2
#endif
q(1,1) = q(1, 1) + y1
q(2,1) = q(2, 1) + y2
q(3,1) = q(3, 1) + y3
q(4,1) = q(4, 1) + y4
q(5,1) = q(5, 1) + y5
q(6,1) = q(6, 1) + y6
#ifndef PACK_REAL_TO_COMPLEX
q(7 ,1) = q(7, 1) + y7
q(8 ,1) = q(8, 1) + y8
q(9 ,1) = q(9, 1) + y9
q(10,1) = q(10,1) + y10
q(11,1) = q(11,1) + y11
q(12,1) = q(12,1) + y12
#endif
q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2)
q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2)
q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2)
q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2)
q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2)
q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2)
q(8, 2) = q(8, 2) + x8 + y8 *hh(2,2)
q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2)
q(10,2) = q(10,2) + x10 + y10*hh(2,2)
q(11,2) = q(11,2) + x11 + y11*hh(2,2)
q(12,2) = q(12,2) + x12 + y12*hh(2,2)
#endif
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1, i) = q(1,i) + x1 *h1 + y1 *h2
q(2, i) = q(2,i) + x2 *h1 + y2 *h2
q(3, i) = q(3,i) + x3 *h1 + y3 *h2
q(4, i) = q(4,i) + x4 *h1 + y4 *h2
q(5, i) = q(5,i) + x5 *h1 + y5 *h2
q(6, i) = q(6,i) + x6 *h1 + y6 *h2
#ifndef PACK_REAL_TO_COMPLEX
q(7, i) = q(7, i) + x7 *h1 + y7 *h2
q(8, i) = q(8, i) + x8 *h1 + y8 *h2
q(9, i) = q(9, i) + x9 *h1 + y9 *h2
q(10,i) = q(10,i) + x10*h1 + y10*h2
q(11,i) = q(11,i) + x11*h1 + y11*h2
q(12,i) = q(12,i) + x12*h1 + y12*h2
#endif
enddo
q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1)
q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1)
q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1)
q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1)
q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1)
q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1)
q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1)
q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1)
q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1)
q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1)
q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_12_generic")
#endif
end subroutine hh_trafo_kernel_12_generic
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_kernel_8_generic(q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck), intent(inout) :: q(ldq/2,*)
real(kind=rk), intent(in) :: h(ldh,*)
#else
real(kind=rk), intent(inout) :: q(:,:)
#endif
real(kind=rk), intent(in) :: hh(ldh,2), s
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4
#else
real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, &
y1, y2, y3, y4, y5, y6, y7, y8
#endif
real(kind=rk) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_8_generic")
#endif
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
#ifndef PACK_REAL_TO_COMPLEX
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
#endif
y1 = q(1,1) + q(1,2)*hh(2,2)
y2 = q(2,1) + q(2,2)*hh(2,2)
y3 = q(3,1) + q(3,2)*hh(2,2)
y4 = q(4,1) + q(4,2)*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
y5 = q(5,1) + q(5,2)*hh(2,2)
y6 = q(6,1) + q(6,2)*hh(2,2)
y7 = q(7,1) + q(7,2)*hh(2,2)
y8 = q(8,1) + q(8,2)*hh(2,2)
#endif
!DEC$ VECTOR ALIGNED
do i=3,nb
......@@ -146,18 +428,28 @@
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
#ifndef PACK_REAL_TO_COMPLEX
x5 = x5 + q(5,i)*h1
y5 = y5 + q(5,i)*h2
x6 = x6 + q(6,i)*h1
y6 = y6 + q(6,i)*h2
x7 = x7 + q(7,i)*h1
y7 = y7 + q(7,i)*h2
x8 = x8 + q(8,i)*h1
y8 = y8 + q(8,i)*h2
#endif
enddo
x1 = x1 + q(1,nb+1)*hh(nb,1)
x2 = x2 + q(2,nb+1)*hh(nb,1)
x3 = x3 + q(3,nb+1)*hh(nb,1)
x4 = x4 + q(4,nb+1)*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
x5 = x5 + q(5,nb+1)*hh(nb,1)
x6 = x6 + q(6,nb+1)*hh(nb,1)
x7 = x7 + q(7,nb+1)*hh(nb,1)
x8 = x8 + q(8,nb+1)*hh(nb,1)
#endif
tau1 = hh(1,1)
tau2 = hh(1,2)
......@@ -167,29 +459,45 @@
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
#ifndef PACK_REAL_TO_COMPLEX
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
#endif
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
#ifndef PACK_REAL_TO_COMPLEX
y5 = y5*h1 + x5*h2
y6 = y6*h1 + x6*h2
y7 = y7*h1 + x7*h2
y8 = y8*h1 + x8*h2
#endif
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
#ifndef PACK_REAL_TO_COMPLEX
q(5,1) = q(5,1) + y5
q(6,1) = q(6,1) + y6
q(7,1) = q(7,1) + y7
q(8,1) = q(8,1) + y8
#endif
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
q(5,2) = q(5,2) + x5 + y5*hh(2,2)
q(6,2) = q(6,2) + x6 + y6*hh(2,2)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
#endif
!DEC$ VECTOR ALIGNED
do i=3,nb
......@@ -199,43 +507,74 @@
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
#ifndef PACK_REAL_TO_COMPLEX
q(5,i) = q(5,i) + x5*h1 + y5*h2
q(6,i) = q(6,i) + x6*h1 + y6*h2
q(7,i) = q(7,i) + x7*h1 + y7*h2
q(8,i) = q(8,i) + x8*h1 + y8*h2
#endif
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
#endif
end subroutine hh_trafo_kernel_12_generic
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_8_generic")
#endif
end subroutine hh_trafo_kernel_8_generic
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_kernel_8_generic(q, hh, nb, ldq, ldh, s)
subroutine hh_trafo_kernel_4_generic(q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck), intent(inout) :: q(ldq/2,*)
real(kind=rk), intent(in) :: hh(ldh,*), s
real(kind=rk), intent(in) :: h(ldh,*)
#else
real(kind=rk), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1)
#endif
real(kind=rk), intent(in) :: hh(ldh,2), s
complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=ck) :: x1, x2, y1, y2
#else
real(kind=rk) :: x1, x2, x3, x4, y1, y2, y3, y4
#endif
real(kind=rk) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_4_generic")
#endif
x1 = q(1,2)
x2 = q(2,2)
#ifndef PACK_REAL_TO_COMPLEX
x3 = q(3,2)
x4 = q(4,2)
#endif
y1 = q(1,1) + q(1,2)*hh(2,2)
<