Commit 24c49dc4 authored by Thomas Auckenthaler's avatar Thomas Auckenthaler
Browse files

Merge branch 'master' of elpa-lib.fhi-berlin.mpg.de:/pub/elpa_lib

parents 737e8df5 952b8091
......@@ -1703,7 +1703,7 @@ contains
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......@@ -1713,12 +1713,12 @@ contains
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
! call quad_hh_trafo(a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......@@ -1730,19 +1730,19 @@ contains
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
! call hexa_hh_trafo(a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
! call quad_hh_trafo(a(1,jj+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......@@ -2787,6 +2787,7 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r
a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
allocate(a(stripe_width,a_dim2,stripe_count))
a(:,:,:) = 0
......@@ -3197,6 +3198,25 @@ contains
subroutine compute_hh_trafo(off, ncols, istripe)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! integer off, ncols, istripe, j, nl, jj
! real*8 ttt
! complex*16 w(nbw,2)
!
! ttt = mpi_wtime()
! nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! call double_hh_trafo_complex(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! enddo
! if(j==1) call single_hh_trafo_complex(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
integer off, ncols, istripe, j, nl, jj
real*8 ttt
......@@ -3205,6 +3225,7 @@ contains
do j = ncols, 1, -1
call single_hh_trafo_complex(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt
......
......@@ -46,6 +46,51 @@ end
! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)
implicit none
integer, intent(in) :: nb, nq, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16 s
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
s = conjg(hh(2,2)*1)
do i=3,nb
s = s+(conjg(hh(i,2))*hh(i-1,1))
enddo
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq,4
call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
enddo
!do i=1,nq-8,12
! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s)
!enddo
! i > nq-8 now, i.e. at most 8 rows remain
!if(nq-i+1 > 4) then
! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s)
!else if(nq-i+1 > 0) then
! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
!endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
......@@ -264,3 +309,394 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, y1, y2, y3, y4
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
y5 = q(5,1) + q(5,2)*conjg(hh(2,2))
y6 = q(6,1) + q(6,2)*conjg(hh(2,2))
y7 = q(7,1) + q(7,2)*conjg(hh(2,2))
y8 = q(8,1) + q(8,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
x5 = x5 + q(5,i)*h1
y5 = y5 + q(5,i)*h2
x6 = x6 + q(6,i)*h1
y6 = y6 + q(6,i)*h2
x7 = x7 + q(7,i)*h1
y7 = y7 + q(7,i)*h2
x8 = x8 + q(8,i)*h1
y8 = y8 + q(8,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
x5 = x5 + q(5,nb+1)*conjg(hh(nb,1))
x6 = x6 + q(6,nb+1)*conjg(hh(nb,1))
x7 = x7 + q(7,nb+1)*conjg(hh(nb,1))
x8 = x8 + q(8,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
y5 = y5*h1 + x5*h2
y6 = y6*h1 + x6*h2
y7 = y7*h1 + x7*h2
y8 = y8*h1 + x8*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(5,1) = q(5,1) + y5
q(6,1) = q(6,1) + y6
q(7,1) = q(7,1) + y7
q(8,1) = q(8,1) + y8
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
q(5,2) = q(5,2) + x5 + y5*hh(2,2)
q(6,2) = q(6,2) + x6 + y6*hh(2,2)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
q(5,i) = q(5,i) + x5*h1 + y5*h2
q(6,i) = q(6,i) + x6*h1 + y6*h2
q(7,i) = q(7,i) + x7*h1 + y7*h2
q(8,i) = q(8,i) + x8*h1 + y8*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
x9 = q(9,2)
x10 = q(10,2)
x11 = q(11,2)
x12 = q(12,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
y5 = q(5,1) + q(5,2)*conjg(hh(2,2))
y6 = q(6,1) + q(6,2)*conjg(hh(2,2))
y7 = q(7,1) + q(7,2)*conjg(hh(2,2))
y8 = q(8,1) + q(8,2)*conjg(hh(2,2))
y9 = q(9,1) + q(9,2)*conjg(hh(2,2))
y10 = q(10,1) + q(10,2)*conjg(hh(2,2))
y11 = q(11,1) + q(11,2)*conjg(hh(2,2))
y12 = q(12,1) + q(12,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
x5 = x5 + q(5,i)*h1
y5 = y5 + q(5,i)*h2
x6 = x6 + q(6,i)*h1
y6 = y6 + q(6,i)*h2
x7 = x7 + q(7,i)*h1
y7 = y7 + q(7,i)*h2
x8 = x8 + q(8,i)*h1
y8 = y8 + q(8,i)*h2
x9 = x9 + q(9,i)*h1
y9 = y9 + q(9,i)*h2
x10 = x10 + q(10,i)*h1
y10 = y10 + q(10,i)*h2
x11 = x11 + q(11,i)*h1
y11 = y11 + q(11,i)*h2
x12 = x12 + q(12,i)*h1
y12 = y12 + q(12,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
x5 = x5 + q(5,nb+1)*conjg(hh(nb,1))
x6 = x6 + q(6,nb+1)*conjg(hh(nb,1))
x7 = x7 + q(7,nb+1)*conjg(hh(nb,1))
x8 = x8 + q(8,nb+1)*conjg(hh(nb,1))
x9 = x9 + q(9,nb+1)*conjg(hh(nb,1))
x10 = x10 + q(10,nb+1)*conjg(hh(nb,1))
x11 = x11 + q(11,nb+1)*conjg(hh(nb,1))
x12 = x12 + q(12,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
x10 = x10*h1
x11 = x11*h1
x12 = x12*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
y5 = y5*h1 + x5*h2
y6 = y6*h1 + x6*h2
y7 = y7*h1 + x7*h2
y8 = y8*h1 + x8*h2
y9 = y9*h1 + x9*h2
y10 = y10*h1 + x10*h2
y11 = y11*h1 + x11*h2
y12 = y12*h1 + x12*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(5,1) = q(5,1) + y5
q(6,1) = q(6,1) + y6
q(7,1) = q(7,1) + y7
q(8,1) = q(8,1) + y8
q(9,1) = q(9,1) + y9
q(10,1) = q(10,1) + y10
q(11,1) = q(11,1) + y11
q(12,1) = q(12,1) + y12
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
q(5,2) = q(5,2) + x5 + y5*hh(2,2)
q(6,2) = q(6,2) + x6 + y6*hh(2,2)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
q(9,2) = q(9,2) + x9 + y9*hh(2,2)
q(10,2) = q(10,2) + x10 + y10*hh(2,2)
q(11,2) = q(11,2) + x11 + y11*hh(2,2)
q(12,2) = q(12,2) + x12 + y12*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
q(5,i) = q(5,i) + x5*h1 + y5*h2
q(6,i) = q(6,i) + x6*h1 + y6*h2
q(7,i) = q(7,i) + x7*h1 + y7*h2
q(8,i) = q(8,i) + x8*h1 + y8*h2
q(9,i) = q(9,i) + x9*h1 + y9*h2
q(10,i) = q(10,i) + x10*h1 + y10*h2
q(11,i) = q(11,i) + x11*h1 + y11*h2
q(12,i) = q(12,i) + x12*h1 + y12*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1)