Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
c9172d13
Commit
c9172d13
authored
Mar 25, 2017
by
Andreas Marek
Browse files
Cleanup of real generic kernel
parent
719457df
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/compute_hh_trafo.X90
View file @
c9172d13
...
...
@@ -296,12 +296,16 @@
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_generic_&
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_generic_&
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
...
...
@@ -310,12 +314,16 @@
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_generic_&
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_generic_&
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
...
...
src/elpa2_kernels/elpa2_kernels_real.F90
View file @
c9172d13
...
...
@@ -64,32 +64,29 @@
module
real_generic_kernel
private
public
double_hh_trafo_generic_double
public
double_hh_trafo_
real_
generic_double
#ifdef WANT_SINGLE_PRECISION_REAL
public
double_hh_trafo_generic_single
public
double_hh_trafo_
real_
generic_single
#endif
contains
#endif
#define
DOUBLE_PRECISION_
REAL 1
#define
REAL_DATATYPE rk8
#
define COMPLEX_DATATYPE ck8
#define REAL
CASE
1
#define
DOUBLE_PRECISION 1
#
include "../precision_macros.h"
#include "elpa2_kernels_real_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef COMPLEX_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
#
un
def
DOUBLE_PRECISION_REAL
#define
REAL_DATATYPE rk4
#
define COMPLEX_DATATYPE ck4
#def
ine REALCASE 1
#define
SINGLE_PRECISION 1
#
include "../precision_macros.h"
#include "elpa2_kernels_real_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef COMPLEX_DATATYPE
#undef REALCASE
#undef SINGLE_PRECISION
#endif
#ifndef USE_ASSUMED_SIZE
...
...
src/elpa2_kernels/elpa2_kernels_real_template.X90
View file @
c9172d13
...
...
@@ -58,43 +58,43 @@
! the intel compiler creates a temp copy of array q!
! this should be prevented if possible without using assumed size arrays
subroutine double_hh_trafo_&
&MATH_DATATYPE&
&_generic_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
#ifdef DOUBLE_PRECISION_REAL
subroutine double_hh_trafo_generic_double(q, hh, nb, nq, ldq, ldh)
#else
subroutine double_hh_trafo_generic_single(q, hh, nb, nq, ldq, ldh)
#endif
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use iso_c_binding
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
integer(kind=ik), intent(in)
:: nb, nq, ldq, ldh
#ifdef USE_ASSUMED_SIZE
real(kind=
REAL
_DATATYPE), intent(inout)
:: q(ldq,*)
real(kind=
REAL
_DATATYPE), intent(in)
:: hh(ldh,*)
real(kind=
C
_DATATYPE
_KIND
), intent(inout) :: q(ldq,*)
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: hh(ldh,*)
#else
real(kind=
REAL
_DATATYPE), intent(inout)
:: q(1:ldq,1:nb+1)
real(kind=
REAL
_DATATYPE), intent(in)
:: hh(1:ldh,1:6)
real(kind=
C
_DATATYPE
_KIND
), intent(inout) :: q(1:ldq,1:nb+1)
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: hh(1:ldh,1:6)
#endif
real(kind=
REAL
_DATATYPE
)
:: s
integer(kind=ik) :: i
real(kind=
C
_DATATYPE
_KIND)
:: s
integer(kind=ik)
:: i
! equivalence(q(1,1),q_complex(1,1))
! Safety only:
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: double_hh_trafo_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: double_hh_trafo_generic_single")
#endif
#endif
call timer%start("kernel generic: double_hh_trafo_&
&MATH_DATATYPE&
&_generic" // &
&PRECISION_SUFFIX &
)
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
...
...
@@ -113,86 +113,55 @@
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
#ifdef DOUBLE_PRECISION_REAL
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_12_generic_double(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_12_generic_double(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#else /* DOUBLE_PRECISION_REAL */
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_12_generic_single(q(i,1),hh, nb, ldq, ldh, s)
call hh_trafo_kernel_12_generic_&
&PRECISION&
& (q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_12_generic_single(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
call hh_trafo_kernel_12_generic_&
&PRECISION&
& (q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#endif /* DOUBLE_PRECISION_REAL */
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
#ifdef DOUBLE_PRECISION_REAL
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_8_generic_double(q(i,1),hh, nb, ldq, ldh, s)
call hh_trafo_kernel_8_generic_&
&PRECISION&
& (q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_8_generic_double(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
call hh_trafo_kernel_8_generic_&
&PRECISION&
& (q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#else /* DOUBLE_PRECISION_REAL */
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_8_generic_single(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_8_generic_single(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#endif /* DOUBLE_PRECISION_REAL */
else if(nq-i+1 > 0) then
#ifdef DOUBLE_PRECISION_REAL
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_4_generic_double(q(i,1),hh, nb, ldq, ldh, s)
call hh_trafo_kernel_4_generic_&
&PRECISION&
& (q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_4_generic_double(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
call hh_trafo_kernel_4_generic_&
&PRECISION&
& (q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#else /* DOUBLE_PRECISION_REAL */
#ifdef USE_ASSUMED_SIZE
call hh_trafo_kernel_4_generic_single(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_kernel_4_generic_single(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
#endif /* DOUBLE_PRECISION_REAL */
endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: double_hh_trafo_generic_double")
#endif
call timer%stop("kernel generic: double_hh_trafo_&
&MATH_DATATYPE&
&_generic" // &
&PRECISION_SUFFIX &
)
#els
e
end subroutin
e
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: double_hh_trafo_generic_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_REAL
end subroutine double_hh_trafo_generic_double
#else
end subroutine double_hh_trafo_generic_single
#endif
! --------------------------------------------------------------------------------------------------
! The following kernels perform the Householder transformation on Q for 12/8/4 rows.
! Please note that Q is declared complex*16 here.
...
...
@@ -200,50 +169,44 @@
! (relevant for Intel SSE and BlueGene double hummer CPUs).
! --------------------------------------------------------------------------------------------------
#ifdef DOUBLE_PRECISION_REAL
subroutine hh_trafo_kernel_12_generic_double(q, hh, nb, ldq, ldh, s)
#else
subroutine hh_trafo_kernel_12_generic_single(q, hh, nb, ldq, ldh, s)
#endif
subroutine hh_trafo_kernel_12_generic_&
&PRECISION&
& (q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
real(kind=REAL_DATATYPE), intent(in) :: hh(ldh,*)
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
#else
real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
#endif
real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
#else
real(kind=
REAL
_DATATYPE), intent(inout)
:: q(:,:)
real(kind=
REAL
_DATATYPE), intent(in)
:: hh(ldh,2)
real(kind=
C
_DATATYPE
_KIND
), intent(inout) :: q(:,:)
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: hh(ldh,2)
#endif
real(kind=
REAL
_DATATYPE), intent(in)
:: s
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: s
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=COMPLEX_DATATYPE)
:: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6
complex(kind=
SPECIAL_
COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6
#else
real(kind=
REAL
_DATATYPE
)
:: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
real(kind=
C
_DATATYPE
_KIND)
:: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
#endif
real(kind=
REAL
_DATATYPE) :: h1, h2, tau1, tau2
real(kind=
C
_DATATYPE
_KIND
) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_12_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_12_generic_single")
#endif
#endif
call timer%start("kernel generic: hh_trafo_kernel_12_generic" // &
&PRECISION_SUFFIX &
)
x1 = q(1,2)
x2 = q(2,2)
...
...
@@ -433,55 +396,49 @@
q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
#endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_12_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_12_generic_single")
#endif
call timer%stop("kernel generic: hh_trafo_kernel_12_generic" // &
&PRECISION_SUFFIX &
)
#endif
#ifdef DOUBLE_PRECISION_REAL
end subroutine hh_trafo_kernel_12_generic_double
#else
end subroutine hh_trafo_kernel_12_generic_single
#endif
end subroutine
! --------------------------------------------------------------------------------------------------
#ifdef DOUBLE_PRECISION_REAL
subroutine hh_trafo_kernel_8_generic_double(q, hh, nb, ldq, ldh, s)
#else
subroutine hh_trafo_kernel_8_generic_single(q, hh, nb, ldq, ldh, s)
#endif
subroutine hh_trafo_kernel_8_generic_&
&PRECISION&
& (q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
real(kind=REAL_DATATYPE), intent(in) :: hh(ldh,*)
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
#else
real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
#endif
real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
#else
real(kind=
REAL
_DATATYPE), intent(inout)
:: q(:,:)
real(kind=
REAL
_DATATYPE), intent(in)
:: hh(ldh,2)
real(kind=
C
_DATATYPE
_KIND
), intent(inout):: q(:,:)
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: hh(ldh,2)
#endif
real(kind=
REAL
_DATATYPE), intent(in)
:: s
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: s
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=COMPLEX_DATATYPE)
:: x1, x2, x3, x4, y1, y2, y3, y4
complex(kind=
SPECIAL_
COMPLEX_DATATYPE) :: x1, x2, x3, x4, y1, y2, y3, y4
#else
real(kind=
REAL
_DATATYPE
)
:: x1, x2, x3, x4, x5, x6, x7, x8, &
real(kind=
C
_DATATYPE
_KIND)
:: x1, x2, x3, x4, x5, x6, x7, x8, &
y1, y2, y3, y4, y5, y6, y7, y8
#endif
real(kind=
REAL
_DATATYPE) :: h1, h2, tau1, tau2
real(kind=
C
_DATATYPE
_KIND
) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_8_generic_double")
#endif
call timer%start("kernel generic: hh_trafo_kernel_8_generic" // &
&PRECISION_SUFFIX &
)
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
...
...
@@ -623,62 +580,51 @@
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
#endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_8_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_8_generic_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_REAL
end subroutine hh_trafo_kernel_8_generic_double
#else
end subroutine hh_trafo_kernel_8_generic_single
#
end
if
call timer%stop("kernel generic: hh_trafo_kernel_8_generic" // &
&PRECISION_SUFFIX &
)
end
subroutine
! --------------------------------------------------------------------------------------------------
#ifdef DOUBLE_PRECISION_REAL
subroutine hh_trafo_kernel_4_generic_double(q, hh, nb, ldq, ldh, s)
#else
subroutine hh_trafo_kernel_4_generic_single(q, hh, nb, ldq, ldh, s)
#endif
subroutine hh_trafo_kernel_4_generic_&
&PRECISION&
& (q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
real(kind=REAL_DATATYPE), intent(in) :: hh(ldh,*)
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*)
#else
real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
#endif
real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
#else
real(kind=
REAL
_DATATYPE), intent(inout)
:: q(:,:) !q(1:ldq/2,1:nb+1)
real(kind=
REAL
_DATATYPE), intent(in)
:: hh(ldh,2)
real(kind=
C
_DATATYPE
_KIND
), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1)
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: hh(ldh,2)
#endif
real(kind=
REAL
_DATATYPE), intent(in)
:: s
real(kind=
C
_DATATYPE
_KIND
), intent(in) :: s
#ifdef PACK_REAL_TO_COMPLEX
complex(kind=COMPLEX_DATATYPE)
:: x1, x2, y1, y2
complex(kind=
SPECIAL_
COMPLEX_DATATYPE) :: x1, x2, y1, y2
#else
real(kind=
REAL
_DATATYPE
)
:: x1, x2, x3, x4, y1, y2, y3, y4
real(kind=
C
_DATATYPE
_KIND)
:: x1, x2, x3, x4, y1, y2, y3, y4
#endif
real(kind=
REAL
_DATATYPE
)
:: h1, h2, tau1, tau2
real(kind=
C
_DATATYPE
_KIND)
:: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_4_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_4_generic_single")
#endif
#endif
call timer%start("kernel generic: hh_trafo_kernel_4_generic" // &
&PRECISION_SUFFIX &
)
x1 = q(1,2)
x2 = q(2,2)
#ifndef PACK_REAL_TO_COMPLEX
...
...
@@ -776,18 +722,8 @@
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
#endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_4_generic_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_kernel_4_generic_single")
#endif
#endif
call timer%stop("kernel generic: hh_trafo_kernel_4_generic" // &
&PRECISION_SUFFIX &
)
#ifdef DOUBLE_PRECISION_REAL
end subroutine hh_trafo_kernel_4_generic_double
#else
end subroutine hh_trafo_kernel_4_generic_single
#endif
end subroutine
src/precision_macros.h
View file @
c9172d13
...
...
@@ -4,6 +4,7 @@
#undef MATH_DATATYPE
#undef BLAS_TRANS_OR_CONJ
#undef PRECISION
#undef SPECIAL_COMPLEX_DATATYPE
#undef PRECISION_STR
#undef REAL_DATATYPE
...
...
@@ -55,7 +56,7 @@
#define PRECISION_STR 'double'
#define PRECISION_SUFFIX "_double"
#define REAL_DATATYPE rk8
#define SPECIAL_COMPLEX_DATATYPE ck8
#define PRECISION_TRTRI DTRTRI
#define PRECISION_POTRF DPOTRF
...
...
@@ -101,6 +102,7 @@
#define PRECISION_STR 'single'
#define PRECISION_SUFFIX "_single"
#define REAL_DATATYPE rk4
#define SPECIAL_COMPLEX_DATATYPE ck4
#define PRECISION_TRTRI STRTRI
#define PRECISION_POTRF SPOTRF
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment