Commit 55b6158f authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify generic single and double precision complex kernel

parent e09ea6a5
......@@ -51,6 +51,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/redist_band.X90
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -771,6 +772,7 @@ EXTRA_DIST = \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/redist_band.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
......
......@@ -10,7 +10,7 @@
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
......@@ -63,1663 +63,22 @@ module complex_generic_kernel
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public single_hh_trafo_complex_generic_single
#endif
contains
! the Intel compiler creates a temp array copy of array q!
! this should be prevented, if possible without using assumed size arrays
subroutine single_hh_trafo_complex_generic_double(q, hh, nb, nq, ldq)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(*)
#else
complex(kind=ck8), intent(inout) :: q(1:ldq,1:nb)
complex(kind=ck8), intent(in) :: hh(1:nb)
#endif
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: single_hh_trafo_complex_generic_double")
#endif
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_12_double(q(i,1),hh, nb, ldq)
#else
call hh_trafo_complex_kernel_12_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_8_double(q(i,1),hh, nb, ldq)
#else
call hh_trafo_complex_kernel_8_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
else if(nq-i+1 > 0) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_4_double(q(i,1),hh, nb, ldq)
#else
call hh_trafo_complex_kernel_4_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: single_hh_trafo_complex_generic_double")
#endif
end subroutine single_hh_trafo_complex_generic_double
! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex_generic_double(q, hh, nb, nq, ldq, ldh)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(ldh,*)
#else
complex(kind=ck8), intent(inout) :: q(1:ldq,1:nb+1)
complex(kind=ck8), intent(in) :: hh(1:ldh,1:2)
#endif
complex(kind=ck8) :: s
integer(kind=ik) :: i
! Safety only:
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: double_hh_trafo_complex_generic_double")
#endif
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
s = conjg(hh(2,2)*1)
do i=3,nb
s = s+(conjg(hh(i,2))*hh(i-1,1))
enddo
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq,4
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
call hh_trafo_complex_kernel_4_2hv_double(q(i,1),hh, nb, ldq, ldh, s)
#else
call hh_trafo_complex_kernel_4_2hv_double(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
enddo
!do i=1,nq-8,12
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s)
#else
! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
!enddo
! i > nq-8 now, i.e. at most 8 rows remain
!if(nq-i+1 > 4) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s)
#else
! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
!else if(nq-i+1 > 0) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
! call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#else
#endif
!endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: double_hh_trafo_complex_generic_double")
#endif
end subroutine double_hh_trafo_complex_generic_double
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12_double(q, hh, nb, ldq)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(*)
#else
complex(kind=ck8), intent(inout) :: q(:,:)
complex(kind=ck8), intent(in) :: hh(1:nb)
#endif
complex(kind=ck8) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc
complex(kind=ck8) :: h1, tau1
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_complex_kernel_12_double")
#endif
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
x9 = q(9,1)
xa = q(10,1)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
x9 = x9 + q(9,i)*h1
xa = xa + q(10,i)*h1
xb = xb + q(11,i)*h1
xc = xc + q(12,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
xa = xa*h1
xb = xb*h1
xc = xc*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
q(9,1) = q(9,1) + x9
q(10,1) = q(10,1) + xa
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
q(9,i) = q(9,i) + x9*h1
q(10,i) = q(10,i) + xa*h1
q(11,i) = q(11,i) + xb*h1
q(12,i) = q(12,i) + xc*h1
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_complex_kernel_12_double")
#endif
end subroutine hh_trafo_complex_kernel_12_double
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8_double(q, hh, nb, ldq)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(*)
#else
complex(kind=ck8), intent(inout) :: q(:,:)
complex(kind=ck8), intent(in) :: hh(1:nb)
#endif
complex(kind=ck8) :: x1, x2, x3, x4, x5, x6, x7, x8
complex(kind=ck8) :: h1, tau1
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_complex_kernel_8_double")
#endif
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_complex_kernel_8_double")
#endif
end subroutine hh_trafo_complex_kernel_8_double
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4_double(q, hh, nb, ldq)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(*)
#else
complex(kind=ck8), intent(inout) :: q(:,:)
complex(kind=ck8), intent(in) :: hh(1:nb)
#endif
complex(kind=ck8) :: x1, x2, x3, x4
complex(kind=ck8) :: h1, tau1
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_complex_kernel_4_double")
#endif
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_complex_kernel_4_double")
#endif
end subroutine hh_trafo_complex_kernel_4_double
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4_2hv_double(q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(ldh,*)
#else
complex(kind=ck8), intent(inout) :: q(:,:)
complex(kind=ck8), intent(in) :: hh(1:ldh,1:2)
#endif
complex(kind=ck8), intent(in) :: s
complex(kind=ck8) :: x1, x2, x3, x4, y1, y2, y3, y4
complex(kind=ck8) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv_double")
#endif
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv_double")
#endif
end subroutine hh_trafo_complex_kernel_4_2hv_double
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8_2hv_double(q, hh, nb, ldq, ldh, s)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck8), intent(inout) :: q(ldq,*)
complex(kind=ck8), intent(in) :: hh(ldh,*)
#else
complex(kind=ck8), intent(inout) :: q(:,:)
complex(kind=ck8), intent(in) :: hh(1:ldh,1:2)
#endif
complex(kind=ck8), intent(in) :: s
complex(kind=ck8) :: x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8
complex(kind=ck8) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv_double")
#endif
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
y5 = q(5,1) + q(5,2)*conjg(hh(2,2))
y6 = q(6,1) + q(6,2)*conjg(hh(2,2))