From 1da1bd50a35b7a3ee39bfeb0e06e6428c76ce244 Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Thu, 28 Jan 2016 09:34:14 +0100 Subject: [PATCH] Remove assumed size arrays from generic complex kernel This change might be performance critical and has to be timed carefully. Thus it is possible to switch back to the old implementation. The new one, however, can actually be debugged --- src/elpa2_kernels/elpa2_kernels_complex.F90 | 73 +++++++++++++++++++-- src/mod_compute_hh_trafo_complex.F90 | 13 ++++ src/mod_compute_hh_trafo_real.F90 | 6 +- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/src/elpa2_kernels/elpa2_kernels_complex.F90 b/src/elpa2_kernels/elpa2_kernels_complex.F90 index 85a35e5..02efee2 100644 --- a/src/elpa2_kernels/elpa2_kernels_complex.F90 +++ b/src/elpa2_kernels/elpa2_kernels_complex.F90 @@ -69,8 +69,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, nq, ldq +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(*) +#else + complex(kind=ck), intent(inout) :: q(1:ldq,1:nb) + complex(kind=ck), intent(in) :: hh(1:nb) +#endif integer(kind=ik) :: i #ifdef HAVE_DETAILED_TIMINGS @@ -86,15 +91,27 @@ contains ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller do i=1,nq-8,12 +#ifdef DESPERATELY_WANT_ASSUMED_SIZE call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_12(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif enddo ! i > nq-8 now, i.e. 
at most 8 rows remain if(nq-i+1 > 4) then +#ifdef DESPERATELY_WANT_ASSUMED_SIZE call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_8(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif else if(nq-i+1 > 0) then +#ifdef DESPERATELY_WANT_ASSUMED_SIZE call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_4(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("kernel generic: single_hh_trafo_complex_generic") @@ -111,8 +128,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(ldh,*) +#else + complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1) + complex(kind=ck), intent(in) :: hh(1:ldh,1:2) +#endif complex(kind=ck) :: s integer(kind=ik) :: i @@ -136,19 +158,35 @@ contains ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller do i=1,nq,4 +#ifdef DESPERATELY_WANT_ASSUMED_SIZE call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif enddo !do i=1,nq-8,12 +#ifdef DESPERATELY_WANT_ASSUMED_SIZE ! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif !enddo ! i > nq-8 now, i.e. at most 8 rows remain !if(nq-i+1 > 4) then +#ifdef DESPERATELY_WANT_ASSUMED_SIZE ! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif !else if(nq-i+1 > 0) then - ! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s) +#ifdef DESPERATELY_WANT_ASSUMED_SIZE + ! 
call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#else + +#endif !endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("kernel generic: double_hh_trafo_complex_generic") @@ -166,9 +204,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(*) - +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:nb) +#endif complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc complex(kind=ck) :: h1, tau1 integer(kind=ik) :: i @@ -269,9 +311,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(*) - +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:nb) +#endif complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8 complex(kind=ck) :: h1, tau1 integer(kind=ik) :: i @@ -351,9 +397,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(*) - +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:nb) +#endif complex(kind=ck) :: x1, x2, x3, x4 complex(kind=ck) :: h1, tau1 integer(kind=ik) :: i @@ -412,8 +462,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(ldh,*) +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:ldh,1:2) +#endif complex(kind=ck), intent(in) :: s complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4 @@ -506,8 +561,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: 
q(ldq,*) complex(kind=ck), intent(in) :: hh(ldh,*) +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:ldh,1:2) +#endif complex(kind=ck), intent(in) :: s complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8 @@ -647,8 +707,13 @@ contains implicit none integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq,*) complex(kind=ck), intent(in) :: hh(ldh,*) +#else + complex(kind=ck), intent(inout) :: q(:,:) + complex(kind=ck), intent(in) :: hh(1:ldh,1:2) +#endif complex(kind=ck), intent(in) :: s complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, & diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 index 9cdd501..c768255 100644 --- a/src/mod_compute_hh_trafo_complex.F90 +++ b/src/mod_compute_hh_trafo_complex.F90 @@ -150,12 +150,25 @@ module compute_hh_trafo_complex ttt = mpi_wtime() do j = ncols, 1, -1 #ifdef WITH_OPENMP +#ifdef DESPERATELY_WANT_ASSUMED_SIZE + call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), & bcast_buffer(1,j+off),nbw,nl,stripe_width) #else + call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), & + bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) +#endif + +#else /* WITH_OPENMP */ +#ifdef DESPERATELY_WANT_ASSUMED_SIZE call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), & bcast_buffer(1,j+off),nbw,nl,stripe_width) +#else + call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), & + bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) #endif +#endif /* WITH_OPENMP */ + enddo #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) endif diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 index cc642e2..3cf7e18 100644 --- a/src/mod_compute_hh_trafo_real.F90 +++ b/src/mod_compute_hh_trafo_real.F90 @@ -131,7 +131,7 
@@ module compute_hh_trafo_real #else call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, & istripe,my_thread), w(1:nbw,1:6), & - nbw, nl, stripe_width, nbw) + nbw, nl, stripe_width, nbw) #endif #else /* WITH_OPENMP */ @@ -141,8 +141,8 @@ module compute_hh_trafo_real nbw, nl, stripe_width, nbw) #else - call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw+1,istripe),w(1:nbw,1:6), & - nbw, nl, stripe_width, nbw) + call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), & + nbw, nl, stripe_width, nbw) #endif #endif /* WITH_OPENMP */ -- GitLab