Commit af6fa0ea authored by Alexander Heinecke's avatar Alexander Heinecke

added SSE/AVX/FMA4 real back transformation kernels

parent 769e1f8f
......@@ -1693,16 +1693,58 @@ contains
subroutine compute_hh_trafo(off, ncols, istripe)
integer off, ncols, istripe, j, nl, jj
real*8 w(nbw,2), ttt
real*8 w(nbw,6), ttt
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
!FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
!do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!do j = ncols, 6, -6
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt
......
ELPA generally uses BLAS-Routines for all compute intensive work
so that the performance of ELPA mainly depends on the quality of
the BLAS implementation used when linking.
The only exception is the backtransformation of the eigenvectors
for the 2-stage solver (ELPA2). In this case BLAS routines cannot
be used effectively due to the nature of the problem.
The compute intensive part of the backtransformation of ELPA2
has been put to a file of its own (elpa2_kernels.f90) so that
this can be replaced by hand tailored, optimized code for
specific platforms.
Currently we offer the following alternatives for the ELPA2 kernels:
* elpa2_kernels.f90 - The generic FORTRAN version of the ELPA2 kernels
which should be useable on every platform.
It contains some hand optimizations (loop unrolling)
in the hope to get optimal code from most FORTRAN
compilers.
* elpa2_kernels_simple.f90 - Plain and simple version of elpa2_kernels.f90.
Please note that we observed that some compilers get
get confused by the hand optimizations done in
elpa2_kernels.f90 and give better performance
with this version - so it is worth to try both!
* elpa2_kernels_bg.f90 - Fortran code enhanced with assembler calls
for the IBM BlueGene/P
* elpa2_tum_kernels_*.c - Optimized intrinisic code for x86_64
systems (i.e. Intel/AMD architecture)
using SSE2/SSE3 operations.
(Use gcc for compiling as Intel compiler generates slower code!)
So which version should be used?
================================
* On x86_64 systems (i.e. almost all Intel/AMD systems) or on the IBM BlueGene/P
you should get the optimal performance using the optimized intrinsics/assembler versions
in elpa2_tum_kernels_*.c or elpa2_kernels_bg.f90 respectively.
* If you don't compile for one of these systems or you don't like to use assembler
for any reason, it is likely that you are best off using elpa2_kernels.f90.
Make a perfomance test with elpa2_kernels_simple.f90, however, to check if
your compiler doesn't get confused by the hand optimizations.
* If you want to develop your own optimized kernels for you platform, it is
easier to start with elpa2_kernels_simple.f90.
Don't let you confuse from the huge code in elpa2_kernels.f90, the mathemathics
done in the kernels is relatively trivial.
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
! It should be compiled with the highest possible optimization level.
!
! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU)
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
implicit none
integer, intent(in) :: nb, nq, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
else if(nq-i+1 > 0) then
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
x9 = q(9,1)
xa = q(10,1)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
x9 = x9 + q(9,i)*h1
xa = xa + q(10,i)*h1
xb = xb + q(11,i)*h1
xc = xc + q(12,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
xa = xa*h1
xb = xb*h1
xc = xc*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
q(9,1) = q(9,1) + x9
q(10,1) = q(10,1) + xa
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
q(9,i) = q(9,i) + x9*h1
q(10,i) = q(10,i) + xa*h1
q(11,i) = q(11,i) + xb*h1
q(12,i) = q(12,i) + xc*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
enddo
end
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran ***
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
implicit none
integer, intent(in) :: nb, nq, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
integer i
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do i=1,nq-8,12
call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if(nq-i+1 > 4) then
call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
else if(nq-i+1 > 0) then
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
x9 = q(9,1)
xa = q(10,1)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*h1
x7 = x7 + q(7,i)*h1
x8 = x8 + q(8,i)*h1
x9 = x9 + q(9,i)*h1
xa = xa + q(10,i)*h1
xb = xb + q(11,i)*h1
xc = xc + q(12,i)*h1
enddo
tau1 = hh(1)
h1 = -tau1
x1 = x1*h1
x2 = x2*h1
x3 = x3*h1
x4 = x4*h1
x5 = x5*h1
x6 = x6*h1
x7 = x7*h1
x8 = x8*h1
x9 = x9*h1
xa = xa*h1
xb = xb*h1
xc = xc*h1
q(1,1) = q(1,1) + x1
q(2,1) = q(2,1) + x2
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
q(5,1) = q(5,1) + x5
q(6,1) = q(6,1) + x6
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
q(9,1) = q(9,1) + x9
q(10,1) = q(10,1) + xa
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
q(2,i) = q(2,i) + x2*h1
q(3,i) = q(3,i) + x3*h1
q(4,i) = q(4,i) + x4*h1
q(5,i) = q(5,i) + x5*h1
q(6,i) = q(6,i) + x6*h1
q(7,i) = q(7,i) + x7*h1
q(8,i) = q(8,i) + x8*h1
q(9,i) = q(9,i) + x9*h1
q(10,i) = q(10,i) + xa*h1
q(11,i) = q(11,i) + xb*h1
q(12,i) = q(12,i) + xc*h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none
integer, intent(in) :: nb, ldq
complex*16, intent(inout) :: q(ldq,*)
complex*16, intent(in) :: hh(*)
complex*16 x1, x2, x3, x4, x5, x6, x7, x8
complex*16 h1, tau1
integer i
x1 = q(1,1)
x2 = q(2,1)
x3 = q(3,1)
x4 = q(4,1)
x5 = q(5,1)
x6 = q(6,1)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
x2 = x2 + q(2,i)*h1
x3 = x3 + q(3,i)*h1
x4 = x4 + q(4,i)*h1
x5 = x5 + q(5,i)*h1
x6 = x6 + q(6,i)*