Commit 0f93fec7 authored by Alexander Heinecke's avatar Alexander Heinecke
Browse files

finalized integration of x86 intrinisic kernels

parent 870ac371
......@@ -2027,50 +2027,30 @@ contains
if(nl<=0) return
endif
!FORTRAN CODE
!FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 2 HOUSEHOLDER VECTORS
!do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
!do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!do j = ncols, 6, -6
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
......@@ -2078,31 +2058,19 @@ contains
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! if (mod(nl,8) == 0) then
! call double_hh_trafo_6hv_fast(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
......
......@@ -28,18 +28,18 @@ Currently we offer the following alternatives for the ELPA2 kernels:
* elpa2_kernels_bg.f90 - Fortran code enhanced with assembler calls
for the IBM BlueGene/P
* elpa2_kernels_asm_x86_64.s - Optimized assembler code for x86_64
* elpa2_tum_kernels_*.c - Optimized intrinisic code for x86_64
systems (i.e. Intel/AMD architecture)
using SSE2/SSE3 operations.
(Use GNU assembler for assembling!)
(Use gcc for compiling as Intel compiler generates slower code!)
So which version should be used?
================================
* On x86_64 systems (i.e. almost all Intel/AMD systems) or on the IBM BlueGene/P
you should get the optimal performance using the optimized assembler versions
in elpa2_kernels_asm_x86_64.s or elpa2_kernels_bg.f90 respectively.
you should get the optimal performance using the optimized intrinsics/assembler versions
in elpa2_tum_kernels_*.c or elpa2_kernels_bg.f90 respectively.
* If you don't compile for one of these systems or you don't like to use assembler
for any reason, it is likely that you are best off using elpa2_kernels.f90.
......
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
! It should be compiled with the highest possible optimization level.
!
! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU)
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
implicit none
integer, intent(in) :: nb, nq, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
else if(nq-i+1 > 0) then
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
x9 = q(9,1)
xa = q(10,1)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
x9 = x9 + q(9,i)*h1
xa = xa + q(10,i)*h1
xb = xb + q(11,i)*h1
xc = xc + q(12,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
xa = xa*h1
xb = xb*h1
xc = xc*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
q(9,1) = q(9,1) + x9
q(10,1) = q(10,1) + xa
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
q(9,i) = q(9,i) + x9*h1
q(10,i) = q(10,i) + xa*h1
q(11,i) = q(11,i) + xb*h1
q(12,i) = q(12,i) + xc*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
enddo
end
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran ***
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
implicit none
integer, intent(in) :: nb, nq, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
else if(nq-i+1 > 0) then
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
x9 = q(9,1)
xa = q(10,1)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
x9 = x9 + q(9,i)*h1
xa = xa + q(10,i)*h1
xb = xb + q(11,i)*h1
xc = xc + q(12,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
xa = xa*h1
xb = xb*h1
xc = xc*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
q(9,1) = q(9,1) + x9
q(10,1) = q(10,1) + xa
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
q(9,i) = q(9,i) + x9*h1
q(10,i) = q(10,i) + xa*h1
q(11,i) = q(11,i) + xb*h1
q(12,i) = q(12,i) + xc*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1