Commit 7decfbb3 authored by Rainer Johanni's avatar Rainer Johanni
Browse files

OpenMP enhancements for ELPA2 and new kernels.

Added OpenMP enhancements to elpa2.f90
Removed MPI-2 dependencies, ELPA should now work with pure MPI-1
Added x86_64 optimized assembler version of the kernels.
Added a straight forward and simple version of the kernels.
Added a description of the kernels.
parent d3a2f621
ELPA generally uses BLAS-Routines for all compute intensive work
so that the performance of ELPA mainly depends on the quality of
the BLAS implementation used when linking.
The only exception is the backtransformation of the eigenvectors
for the 2-stage solver (ELPA2). In this case BLAS routines cannot
be used effectively due to the nature of the problem.
The compute intensive part of the backtransformation of ELPA2
has been put to a file of its own (elpa2_kernels.f90) so that
this can be replaced by hand tailored, optimized code for
specific platforms.
Currently we offer the following alternatives for the ELPA2 kernels:
* elpa2_kernels.f90 - The generic FORTRAN version of the ELPA2 kernels
which should be useable on every platform.
It contains some hand optimizations (loop unrolling)
in the hope to get optimal code from most FORTRAN
compilers.
* elpa2_kernels_simple.f90 - Plain and simple version of elpa2_kernels.f90.
Please note that we observed that some compilers get
get confused by the hand optimizations done in
elpa2_kernels.f90 and give better performance
with this version - so it is worth to try both!
* elpa2_kernels_bg.f90 - Fortran code enhanced with assembler calls
for the IBM BlueGene/P
* elpa2_kernels_asm_x86_64.s - Optimized assembler code for x86_64
systems (i.e. Intel/AMD architecture)
using SSE2/SSE3 operations.
(Use GNU assembler for assembling!)
So which version should be used?
================================
* On x86_64 systems (i.e. almost all Intel/AMD systems) or on the IBM BlueGene/P
you should get the optimal performance using the optimized assembler versions
in elpa2_kernels_asm_x86_64.s or elpa2_kernels_bg.f90 respectively.
* If you don't compile for one of these systems or you don't like to use assembler
for any reason, it is likely that you are best off using elpa2_kernels.f90.
Make a perfomance test with elpa2_kernels_simple.f90, however, to check if
your compiler doesn't get confused by the hand optimizations.
* If you want to develop your own optimized kernels for you platform, it is
easier to start with elpa2_kernels_simple.f90.
Don't let you confuse from the huge code in elpa2_kernels.f90, the mathemathics
done in the kernels is relatively trivial.
This diff is collapsed.
# --------------------------------------------------------------------------------------------------
#
# This file contains the compute intensive kernels for the Householder transformations,
# coded in x86_64 assembler and using SSE2/SSE3 instructions.
#
# It must be assembled with GNU assembler (just "as" on most Linux machines)
#
# Copyright of the original code rests with the authors inside the ELPA
# consortium. The copyright of any additional modifications shall rest
# with their original authors, but shall adhere to the licensing terms
# distributed along with the original code in the file "COPYING".
#
# --------------------------------------------------------------------------------------------------
.globl double_hh_trafo_
.globl single_hh_trafo_complex_
.text
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
.macro hh_trafo_real nrows
# When this macro is called, the following registers are set and must not be changed
# %rdi: Address of q
# %rsi: Address of hh
# %rdx: nb
# %rcx: Remaining rows nq
# %r8: ldq in bytes
# %r9: ldh in bytes
# %rax: address of hh at the end of the loops
# The top of the stack must contain the dot product of the two Householder vectors
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
# x1 = q(1,2)
# x2 = q(2,2)
#
# y1 = q(1,1) + q(1,2)*hh(2,2)
# y2 = q(2,1) + q(2,2)*hh(2,2)
movaps (%r10), %xmm6 # y1 = q(1,1)
movaps 16(%r10), %xmm7 # y2 = q(2,1)
.if \nrows>=8
movaps 32(%r10), %xmm8
movaps 48(%r10), %xmm9
.if \nrows==12
movaps 64(%r10), %xmm10
movaps 80(%r10), %xmm11
.endif
.endif
addq %r8, %r10 # %r10 => q(.,2)
movddup 8(%r11,%r9), %xmm15 # hh(2,2)
.macro mac_pre_loop1 qoff, X, Y
movaps \qoff(%r10), \X # xn = q(n,2)
movaps \X, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + xn*h(2,2)
.endm
mac_pre_loop1 0, %xmm0, %xmm6
mac_pre_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_pre_loop1 32, %xmm2, %xmm8
mac_pre_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_pre_loop1 64, %xmm4, %xmm10
mac_pre_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_pre_loop1
# do i=3,nb
# h1 = hh(i-1,1)
# h2 = hh(i,2)
# x1 = x1 + q(1,i)*h1
# y1 = y1 + q(1,i)*h2
# x2 = x2 + q(2,i)*h1
# y2 = y2 + q(2,i)*h2
# ...
# enddo
addq $8, %r11
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # hh(i-1,1)
movddup 8(%r11,%r9), %xmm15 # hh(i,2)
.macro mac_loop1 qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i)
movaps %xmm13, %xmm12
mulpd %xmm14, %xmm13
addpd %xmm13, \X # xn = xn + q(.,i)*h1
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + q(.,i)*h2
.endm
mac_loop1 0, %xmm0, %xmm6
mac_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_loop1 32, %xmm2, %xmm8
mac_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_loop1 64, %xmm4, %xmm10
mac_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop1
addq $8, %r11
jmp 1b
2:
# x1 = x1 + q(1,nb+1)*hh(nb,1)
# x2 = x2 + q(2,nb+1)*hh(nb,1)
addq %r8, %r10 # %r10 => q(.,nb+1)
movddup (%r11), %xmm14
.macro mac_post_loop1 qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1)
mulpd %xmm14, %xmm13
addpd %xmm13, \X
.endm
mac_post_loop1 0, %xmm0
mac_post_loop1 16, %xmm1
.if \nrows>=8
mac_post_loop1 32, %xmm2
mac_post_loop1 48, %xmm3
.if \nrows==12
mac_post_loop1 64, %xmm4
mac_post_loop1 80, %xmm5
.endif
.endif
.purgem mac_post_loop1
# tau1 = hh(1,1)
# tau2 = hh(1,2)
#
# h1 = -tau1
# x1 = x1*h1
# x2 = x2*h1
movq %rsi, %r11 # restore %r11 (hh(1,1))
movddup (%r11), %xmm12 # hh(1,1)
xorps %xmm14, %xmm14
subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1)
mulpd %xmm14, %xmm0
mulpd %xmm14, %xmm1
.if \nrows>=8
mulpd %xmm14, %xmm2
mulpd %xmm14, %xmm3
.if \nrows==12
mulpd %xmm14, %xmm4
mulpd %xmm14, %xmm5
.endif
.endif
# h1 = -tau2
# h2 = -tau2*s
# y1 = y1*h1 + x1*h2
# y2 = y2*h1 + x2*h2
movddup (%r11,%r9), %xmm12 # hh(1,2)
xorps %xmm15, %xmm15
subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1
movaps %xmm15, %xmm14
movddup (%rsp), %xmm12 # Get s from top of stack
mulpd %xmm12, %xmm14 # %xmm14 = h2
.macro mac_xform_y X, Y
mulpd %xmm15, \Y # y1 = y1*h1
movaps \X, %xmm12
mulpd %xmm14, %xmm12
addpd %xmm12, \Y
.endm
mac_xform_y %xmm0, %xmm6
mac_xform_y %xmm1, %xmm7
.if \nrows>=8
mac_xform_y %xmm2, %xmm8
mac_xform_y %xmm3, %xmm9
.if \nrows==12
mac_xform_y %xmm4, %xmm10
mac_xform_y %xmm5, %xmm11
.endif
.endif
.purgem mac_xform_y
# q(1,1) = q(1,1) + y1
# q(2,1) = q(2,1) + y2
movq %rdi, %r10 # restore original Q
.macro mac_pre_loop2_1 qoff, Y
movaps \qoff(%r10), %xmm13 # q(.,1)
addpd \Y, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_1 0, %xmm6
mac_pre_loop2_1 16, %xmm7
.if \nrows>=8
mac_pre_loop2_1 32, %xmm8
mac_pre_loop2_1 48, %xmm9
.if \nrows==12
mac_pre_loop2_1 64, %xmm10
mac_pre_loop2_1 80, %xmm11
.endif
.endif
.purgem mac_pre_loop2_1
# q(1,2) = q(1,2) + x1 + y1*hh(2,2)
# q(2,2) = q(2,2) + x2 + y2*hh(2,2)
addq %r8, %r10 # %r10 => q(.,2)
movddup 8(%r11,%r9), %xmm15 # hh(2,2)
.macro mac_pre_loop2_2 qoff, X, Y
movaps \X, %xmm13
movaps \Y, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, %xmm13
addpd \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_2 0, %xmm0, %xmm6
mac_pre_loop2_2 16, %xmm1, %xmm7
.if \nrows>=8
mac_pre_loop2_2 32, %xmm2, %xmm8
mac_pre_loop2_2 48, %xmm3, %xmm9
.if \nrows==12
mac_pre_loop2_2 64, %xmm4, %xmm10
mac_pre_loop2_2 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_pre_loop2_2
# do i=3,nb
# h1 = hh(i-1,1)
# h2 = hh(i,2)
# q(1,i) = q(1,i) + x1*h1 + y1*h2
# q(2,i) = q(2,i) + x2*h1 + y2*h2
# enddo
addq $8, %r11
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # hh(i-1,1)
movddup 8(%r11,%r9), %xmm15 # hh(i,2)
.macro mac_loop2 qoff, X, Y
movaps \X, %xmm13
mulpd %xmm14, %xmm13
movaps \Y, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, %xmm13
addpd \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_loop2 0, %xmm0, %xmm6
mac_loop2 16, %xmm1, %xmm7
.if \nrows>=8
mac_loop2 32, %xmm2, %xmm8
mac_loop2 48, %xmm3, %xmm9
.if \nrows==12
mac_loop2 64, %xmm4, %xmm10
mac_loop2 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop2
addq $8, %r11
jmp 1b
2:
# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
addq %r8, %r10 # %r10 => q(.,nb+1)
movddup (%r11), %xmm14
.macro mac_post_loop2 qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1)
mulpd %xmm14, \X
addpd \X, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_post_loop2 0, %xmm0
mac_post_loop2 16, %xmm1
.if \nrows>=8
mac_post_loop2 32, %xmm2
mac_post_loop2 48, %xmm3
.if \nrows==12
mac_post_loop2 64, %xmm4
mac_post_loop2 80, %xmm5
.endif
.endif
.purgem mac_post_loop2
.endm
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
# subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh)
#
# integer, intent(in) :: nb, nq, ldq, ldh
# real*8, intent(inout) :: q(ldq,*)
# real*8, intent(in) :: hh(ldh,*)
#
# Parameter mapping to registers
# parameter 1: %rdi : q
# parameter 2: %rsi : hh
# parameter 3: %rdx : nb
# parameter 4: %rcx : nq
# parameter 5: %r8 : ldq
# parameter 6: %r9 : ldh
#
#-------------------------------------------------------------------------------
.align 16,0x90
double_hh_trafo_:
# Get integer parameters into corresponding registers
movslq (%rdx), %rdx # nb
movslq (%rcx), %rcx # nq
movslq (%r8), %r8 # ldq
movslq (%r9), %r9 # ldh
# Get ldq in bytes
addq %r8, %r8
addq %r8, %r8
addq %r8, %r8 # 8*ldq, i.e. ldq in bytes
# Get ldh in bytes
addq %r9, %r9
addq %r9, %r9
addq %r9, %r9 # 8*ldh, i.e. ldh in bytes
# set %rax to the address of hh at the end of the loops,
# i.e. if %rdx >= %rax we must jump out of the loop.
# please note: %rax = 8*%rdx + %rsi - 8
movq %rdx, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rsi, %rax
subq $8, %rax
#-----------------------------------------------------------
# Calculate the dot product of the two Householder vectors
# decrement stack pointer to make space for s
subq $8, %rsp
# Fortran code:
# s = hh(2,2)*1
# do i=3,nb
# s = s+hh(i,2)*hh(i-1,1)
# enddo
movq %rsi, %r11 # Copy address of hh
movsd 8(%r11,%r9), %xmm0 # hh(2,2)
addq $8, %r11
1:
cmpq %rax, %r11
jge 2f
movsd (%r11), %xmm14 # hh(i-1,1)
movsd 8(%r11,%r9), %xmm15 # hh(i,2)
mulsd %xmm14, %xmm15
addsd %xmm15, %xmm0
addq $8, %r11
jmp 1b
2:
movsd %xmm0, (%rsp) # put s on top of stack
#-----------------------------------------------------------
rloop_s:
cmpq $8, %rcx # if %rcx <= 8 jump out of loop
jle rloop_e
hh_trafo_real 12 # transform 12 rows
addq $96, %rdi # increment q start adress by 96 bytes (6 rows)
subq $12, %rcx # decrement nq
jmp rloop_s
rloop_e:
cmpq $4, %rcx # if %rcx <= 4 jump to test_2
jle test_4
hh_trafo_real 8 # transform 8 rows
jmp return1
test_4:
cmpq $0, %rcx # if %rcx <= 0 jump to return
jle return1
hh_trafo_real 4 # transform 4 rows
return1:
addq $8, %rsp # reset stack pointer
ret
.align 16,0x90
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
.macro hh_trafo_complex nrows
# When this macro is called, the following registers are set and must not be changed
# %rdi: Address of q
# %rsi: Address of hh
# %rdx: nb
# %rcx: Remaining rows nq
# %r8: ldq in bytes
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
# set %rax to the address of hh at the end of the loops,
# i.e. if %rdx >= %rax we must jump out of the loop.
# please note: %rax = 16*%rdx + %rsi
movq %rdx, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rsi, %rax
# x1 = q(1,1); y1 = 0
# x2 = q(2,1); y2 = 0
# ...
movaps (%r10), %xmm0
movaps 16(%r10), %xmm1
xorps %xmm6, %xmm6
xorps %xmm7, %xmm7
.if \nrows>=4
movaps 32(%r10), %xmm2
movaps 48(%r10), %xmm3
xorps %xmm8, %xmm8
xorps %xmm9, %xmm9
.if \nrows==6
movaps 64(%r10), %xmm4
movaps 80(%r10), %xmm5
xorps %xmm10, %xmm10
xorps %xmm11, %xmm11
.endif
.endif
# do i=2,nb
# h1 = conjg(hh(i))
# x1 = x1 + q(1,i)*h1
# x2 = x2 + q(2,i)*h1
# ...
# enddo
addq $16, %r11 # %r11 => hh(2)
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # real(hh(i))
movddup 8(%r11), %xmm15 # imag(hh(i))
.macro mac_loop1 qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i)
movaps %xmm13, %xmm12
mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i))
addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i))
mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i))
addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i))
.endm
mac_loop1 0, %xmm0, %xmm6
mac_loop1 16, %xmm1, %xmm7
.if \nrows>=4
mac_loop1 32, %xmm2, %xmm8
mac_loop1 48, %xmm3, %xmm9
.if \nrows==6
mac_loop1 64, %xmm4, %xmm10
mac_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop1
addq $16, %r11 # %r11 => hh(i+1)
jmp 1b
2:
# Now the content of the yn has to be swapped and added to xn
.macro mac_post_loop_1 X, Y
shufpd $1, \Y, \Y
addpd \Y, \X
.endm
mac_post_loop_1 %xmm0, %xmm6
mac_post_loop_1 %xmm1, %xmm7