Commit 16114466 authored by Alexander Heinecke's avatar Alexander Heinecke
Browse files

moved new kernels into development branch

parent 8d197e00
......@@ -24,7 +24,7 @@ LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_l
##CC=icc -O3
##CCOPT=$(CC) -msse3
#MKL_HOME=/opt/intel/mkl/lib/intel64
#LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
#LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64 -lstdc++
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) and GCC 4.6 with FMA4 for AMD Bulldozer:
......@@ -36,7 +36,7 @@ LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_l
#CCOPT=$(CC) -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize -mfma4 -mxop -march=bdver1 -D__USE_AVX128__
##CCOPT=$(CC) -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize -mfma4 -mxop -march=bdver1
##LIBS = -L/opt/acml5.0.0/gfortran64_fma4/lib/ -lacml -lgfortran libscalapack.a
#LIBS = -L/lrz/sys/libraries/acml/5.2.0/ifort64_fma4_mp/lib -lacml_mp -lgfortran libscalapack.a
#LIBS = -L/lrz/sys/libraries/acml/5.2.0/ifort64_fma4_mp/lib -lacml_mp -lgfortran libscalapack.a -lsdtc++
#
# ------------------------------------------------------------------------------
# Settings for Intel Fortran (Linux) old 11.x Toolchain, !!!!!!! do not use !!!!!!:
......
......@@ -2050,7 +2050,7 @@ contains
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......@@ -2060,12 +2060,12 @@ contains
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! call quad_hh_trafo(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......@@ -2077,19 +2077,19 @@ contains
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! call hexa_hh_trafo(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! call quad_hh_trafo(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......
......@@ -46,6 +46,51 @@ end
! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)
implicit none
integer, intent(in) :: nb, nq, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16 s
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
s = conjg(hh(2,2)*1)
do i=3,nb
s = s+(conjg(hh(i,2))*hh(i-1,1))
enddo
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq,4
call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
enddo
!do i=1,nq-8,12
! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s)
!enddo
! i > nq-8 now, i.e. at most 8 rows remain
!if(nq-i+1 > 4) then
! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s)
!else if(nq-i+1 > 0) then
! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
!endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
......@@ -264,3 +309,394 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, y1, y2, y3, y4
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
y5 = q(5,1) + q(5,2)*conjg(hh(2,2))
y6 = q(6,1) + q(6,2)*conjg(hh(2,2))
y7 = q(7,1) + q(7,2)*conjg(hh(2,2))
y8 = q(8,1) + q(8,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
x5 = x5 + q(5,i)*h1
y5 = y5 + q(5,i)*h2
x6 = x6 + q(6,i)*h1
y6 = y6 + q(6,i)*h2
x7 = x7 + q(7,i)*h1
y7 = y7 + q(7,i)*h2
x8 = x8 + q(8,i)*h1
y8 = y8 + q(8,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
x5 = x5 + q(5,nb+1)*conjg(hh(nb,1))
x6 = x6 + q(6,nb+1)*conjg(hh(nb,1))
x7 = x7 + q(7,nb+1)*conjg(hh(nb,1))
x8 = x8 + q(8,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
y5 = y5*h1 + x5*h2
y6 = y6*h1 + x6*h2
y7 = y7*h1 + x7*h2
y8 = y8*h1 + x8*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(5,1) = q(5,1) + y5
q(6,1) = q(6,1) + y6
q(7,1) = q(7,1) + y7
q(8,1) = q(8,1) + y8
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
q(5,2) = q(5,2) + x5 + y5*hh(2,2)
q(6,2) = q(6,2) + x6 + y6*hh(2,2)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
q(5,i) = q(5,i) + x5*h1 + y5*h2
q(6,i) = q(6,i) + x6*h1 + y6*h2
q(7,i) = q(7,i) + x7*h1 + y7*h2
q(8,i) = q(8,i) + x8*h1 + y8*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
implicit none
integer, intent(in) :: nb, ldq, ldh
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(ldh,*)
complex*16, intent(in) :: s
complex*16 x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
complex*16 h1, h2, tau1, tau2
integer i
x1 = q(1,2)
x2 = q(2,2)
x3 = q(3,2)
x4 = q(4,2)
x5 = q(5,2)
x6 = q(6,2)
x7 = q(7,2)
x8 = q(8,2)
x9 = q(9,2)
x10 = q(10,2)
x11 = q(11,2)
x12 = q(12,2)
y1 = q(1,1) + q(1,2)*conjg(hh(2,2))
y2 = q(2,1) + q(2,2)*conjg(hh(2,2))
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
y5 = q(5,1) + q(5,2)*conjg(hh(2,2))
y6 = q(6,1) + q(6,2)*conjg(hh(2,2))
y7 = q(7,1) + q(7,2)*conjg(hh(2,2))
y8 = q(8,1) + q(8,2)*conjg(hh(2,2))
y9 = q(9,1) + q(9,2)*conjg(hh(2,2))
y10 = q(10,1) + q(10,2)*conjg(hh(2,2))
y11 = q(11,1) + q(11,2)*conjg(hh(2,2))
y12 = q(12,1) + q(12,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
x1 = x1 + q(1,i)*h1
y1 = y1 + q(1,i)*h2
x2 = x2 + q(2,i)*h1
y2 = y2 + q(2,i)*h2
x3 = x3 + q(3,i)*h1
y3 = y3 + q(3,i)*h2
x4 = x4 + q(4,i)*h1
y4 = y4 + q(4,i)*h2
x5 = x5 + q(5,i)*h1
y5 = y5 + q(5,i)*h2
x6 = x6 + q(6,i)*h1
y6 = y6 + q(6,i)*h2
x7 = x7 + q(7,i)*h1
y7 = y7 + q(7,i)*h2
x8 = x8 + q(8,i)*h1
y8 = y8 + q(8,i)*h2
x9 = x9 + q(9,i)*h1
y9 = y9 + q(9,i)*h2
x10 = x10 + q(10,i)*h1
y10 = y10 + q(10,i)*h2
x11 = x11 + q(11,i)*h1
y11 = y11 + q(11,i)*h2
x12 = x12 + q(12,i)*h1
y12 = y12 + q(12,i)*h2
enddo
x1 = x1 + q(1,nb+1)*conjg(hh(nb,1))
x2 = x2 + q(2,nb+1)*conjg(hh(nb,1))
x3 = x3 + q(3,nb+1)*conjg(hh(nb,1))
x4 = x4 + q(4,nb+1)*conjg(hh(nb,1))
x5 = x5 + q(5,nb+1)*conjg(hh(nb,1))
x6 = x6 + q(6,nb+1)*conjg(hh(nb,1))
x7 = x7 + q(7,nb+1)*conjg(hh(nb,1))
x8 = x8 + q(8,nb+1)*conjg(hh(nb,1))
x9 = x9 + q(9,nb+1)*conjg(hh(nb,1))
x10 = x10 + q(10,nb+1)*conjg(hh(nb,1))
x11 = x11 + q(11,nb+1)*conjg(hh(nb,1))
x12 = x12 + q(12,nb+1)*conjg(hh(nb,1))
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
x10 = x10*h1
x11 = x11*h1
x12 = x12*h1
h1 = -tau2
h2 = -tau2*s
y1 = y1*h1 + x1*h2
y2 = y2*h1 + x2*h2
y3 = y3*h1 + x3*h2
y4 = y4*h1 + x4*h2
y5 = y5*h1 + x5*h2
y6 = y6*h1 + x6*h2
y7 = y7*h1 + x7*h2
y8 = y8*h1 + x8*h2
y9 = y9*h1 + x9*h2
y10 = y10*h1 + x10*h2
y11 = y11*h1 + x11*h2
y12 = y12*h1 + x12*h2
q(1,1) = q(1,1) + y1
q(2,1) = q(2,1) + y2
q(3,1) = q(3,1) + y3
q(4,1) = q(4,1) + y4
q(5,1) = q(5,1) + y5
q(6,1) = q(6,1) + y6
q(7,1) = q(7,1) + y7
q(8,1) = q(8,1) + y8
q(9,1) = q(9,1) + y9
q(10,1) = q(10,1) + y10
q(11,1) = q(11,1) + y11
q(12,1) = q(12,1) + y12
q(1,2) = q(1,2) + x1 + y1*hh(2,2)
q(2,2) = q(2,2) + x2 + y2*hh(2,2)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
q(5,2) = q(5,2) + x5 + y5*hh(2,2)
q(6,2) = q(6,2) + x6 + y6*hh(2,2)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
q(9,2) = q(9,2) + x9 + y9*hh(2,2)
q(10,2) = q(10,2) + x10 + y10*hh(2,2)
q(11,2) = q(11,2) + x11 + y11*hh(2,2)
q(12,2) = q(12,2) + x12 + y12*hh(2,2)
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1,i) = q(1,i) + x1*h1 + y1*h2
q(2,i) = q(2,i) + x2*h1 + y2*h2
q(3,i) = q(3,i) + x3*h1 + y3*h2
q(4,i) = q(4,i) + x4*h1 + y4*h2
q(5,i) = q(5,i) + x5*h1 + y5*h2
q(6,i) = q(6,i) + x6*h1 + y6*h2
q(7,i) = q(7,i) + x7*h1 + y7*h2
q(8,i) = q(8,i) + x8*h1 + y8*h2
q(9,i) = q(9,i) + x9*h1 + y9*h2
q(10,i) = q(10,i) + x10*h1 + y10*h2
q(11,i) = q(11,i) + x11*h1 + y11*h2
q(12,i) = q(12,i) + x12*h1 + y12*h2
enddo
q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
q(9,nb+1) = q(9,nb+1) + x9*hh(nb,1)
q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1)
q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1)
q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
end
! --------------------------------------------------------------------------------------------------
......@@ -45,3 +45,58 @@ subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
end
! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)