Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
f16576db
Commit
f16576db
authored
Jun 25, 2014
by
Lorenz Huedepohl
Browse files
Replace inline SSE test with actual source file
parent
e67c3ac7
Changes
2
Hide whitespace changes
Inline
Side-by-side
ELPA_2014.06/configure
View file @
f16576db
...
...
@@ -5022,580 +5022,17 @@ install_complex_avx_block2=no
# Announce the check on the log (fd 5) and on the console (fd 6).
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether SSE assembler kernel can be compiled" >&5
$as_echo_n "checking whether SSE assembler kernel can be compiled... " >&6; }
# Write the probe source to ./test.s. cat copies the here-document to the
# file; echo does not read standard input, so it would leave ./test.s
# holding a single empty line and the assemble step below would always pass.
cat <<EOF > ./test.s
# Export both kernel entry points and emit what follows into the text section.
.globl double_hh_trafo_
.globl single_hh_trafo_complex_
.text
# hh_trafo_real nrows
# Expands to straight-line code that updates one strip of q in place using
# both columns of hh. Each xmm register carries two doubles (16 bytes), so
# the strip is 2 registers wide by default, 4 when nrows>=8 and 6 when
# nrows==12.
# Reads:   %rdi     address of q
#          %rsi     address of hh
#          %r8      byte step from one column of q to the next
#          %r9      byte offset from column 1 to column 2 of hh
#          %rax     end address for the hh pointer in both loops
#          (%rsp)   scalar s
# Scratch: %r10 (q pointer), %r11 (hh pointer), %xmm12-%xmm15, flags.
# x values live in %xmm0-%xmm5, y values in %xmm6-%xmm11.
# The inner helper macros are purged right after use; presumably this is what
# lets the outer macro be expanded several times without redefining them.
# The numeric labels 1 and 2 are reused by both loops.
.macro hh_trafo_real nrows
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
# The y accumulators start out as the first column of q.
movaps (%r10), %xmm6 # y1 = q(1,1)
movaps 16(%r10), %xmm7 # y2 = q(2,1)
.if \nrows>=8
movaps 32(%r10), %xmm8
movaps 48(%r10), %xmm9
.if \nrows==12
movaps 64(%r10), %xmm10
movaps 80(%r10), %xmm11
.endif
.endif
addq %r8, %r10 # %r10 => q(.,2)
movddup 8(%r11,%r9), %xmm15 # hh(2,2)
# Second column: x starts as q(.,2) and y picks up x*hh(2,2).
.macro mac_pre_loop1 qoff, X, Y
movaps \qoff(%r10), \X # xn = q(n,2)
movaps \X, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + xn*h(2,2)
.endm
mac_pre_loop1 0, %xmm0, %xmm6
mac_pre_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_pre_loop1 32, %xmm2, %xmm8
mac_pre_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_pre_loop1 64, %xmm4, %xmm10
mac_pre_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_pre_loop1
addq \$8, %r11
# First loop: accumulate x and y over the remaining columns of q.
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # hh(i-1,1)
movddup 8(%r11,%r9), %xmm15 # hh(i,2)
.macro mac_loop1 qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i)
movaps %xmm13, %xmm12
mulpd %xmm14, %xmm13
addpd %xmm13, \X # xn = xn + q(.,i)*h1
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + q(.,i)*h2
.endm
mac_loop1 0, %xmm0, %xmm6
mac_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_loop1 32, %xmm2, %xmm8
mac_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_loop1 64, %xmm4, %xmm10
mac_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop1
addq \$8, %r11
jmp 1b
2:
# One more column after the loop contributes to x only.
addq %r8, %r10 # %r10 => q(.,nb+1)
movddup (%r11), %xmm14
.macro mac_post_loop1 qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1)
mulpd %xmm14, %xmm13
addpd %xmm13, \X
.endm
mac_post_loop1 0, %xmm0
mac_post_loop1 16, %xmm1
.if \nrows>=8
mac_post_loop1 32, %xmm2
mac_post_loop1 48, %xmm3
.if \nrows==12
mac_post_loop1 64, %xmm4
mac_post_loop1 80, %xmm5
.endif
.endif
.purgem mac_post_loop1
# Scale x by -hh(1,1); the negation is built as 0 - value.
movq %rsi, %r11 # restore %r11 (hh(1,1))
movddup (%r11), %xmm12 # hh(1,1)
xorps %xmm14, %xmm14
subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1)
mulpd %xmm14, %xmm0
mulpd %xmm14, %xmm1
.if \nrows>=8
mulpd %xmm14, %xmm2
mulpd %xmm14, %xmm3
.if \nrows==12
mulpd %xmm14, %xmm4
mulpd %xmm14, %xmm5
.endif
.endif
# Form h1 = -hh(1,2) and h2 = h1*s, then y = y*h1 + x*h2.
movddup (%r11,%r9), %xmm12 # hh(1,2)
xorps %xmm15, %xmm15
subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1
movaps %xmm15, %xmm14
movddup (%rsp), %xmm12 # Get s from top of stack
mulpd %xmm12, %xmm14 # %xmm14 = h2
.macro mac_xform_y X, Y
mulpd %xmm15, \Y # y1 = y1*h1
movaps \X, %xmm12
mulpd %xmm14, %xmm12
addpd %xmm12, \Y
.endm
mac_xform_y %xmm0, %xmm6
mac_xform_y %xmm1, %xmm7
.if \nrows>=8
mac_xform_y %xmm2, %xmm8
mac_xform_y %xmm3, %xmm9
.if \nrows==12
mac_xform_y %xmm4, %xmm10
mac_xform_y %xmm5, %xmm11
.endif
.endif
.purgem mac_xform_y
# Write-back pass. First column: q(.,1) = q(.,1) + y.
movq %rdi, %r10 # restore original Q
.macro mac_pre_loop2_1 qoff, Y
movaps \qoff(%r10), %xmm13 # q(.,1)
addpd \Y, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_1 0, %xmm6
mac_pre_loop2_1 16, %xmm7
.if \nrows>=8
mac_pre_loop2_1 32, %xmm8
mac_pre_loop2_1 48, %xmm9
.if \nrows==12
mac_pre_loop2_1 64, %xmm10
mac_pre_loop2_1 80, %xmm11
.endif
.endif
.purgem mac_pre_loop2_1
# Second column: q(.,2) = q(.,2) + x + y*hh(2,2).
# %r11 still holds the hh base address restored above.
addq %r8, %r10 # %r10 => q(.,2)
movddup 8(%r11,%r9), %xmm15 # hh(2,2)
.macro mac_pre_loop2_2 qoff, X, Y
movaps \X, %xmm13
movaps \Y, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, %xmm13
addpd \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_2 0, %xmm0, %xmm6
mac_pre_loop2_2 16, %xmm1, %xmm7
.if \nrows>=8
mac_pre_loop2_2 32, %xmm2, %xmm8
mac_pre_loop2_2 48, %xmm3, %xmm9
.if \nrows==12
mac_pre_loop2_2 64, %xmm4, %xmm10
mac_pre_loop2_2 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_pre_loop2_2
addq \$8, %r11
# Second loop: q(.,i) = q(.,i) + x*hh(i-1,1) + y*hh(i,2).
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # hh(i-1,1)
movddup 8(%r11,%r9), %xmm15 # hh(i,2)
.macro mac_loop2 qoff, X, Y
movaps \X, %xmm13
mulpd %xmm14, %xmm13
movaps \Y, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, %xmm13
addpd \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_loop2 0, %xmm0, %xmm6
mac_loop2 16, %xmm1, %xmm7
.if \nrows>=8
mac_loop2 32, %xmm2, %xmm8
mac_loop2 48, %xmm3, %xmm9
.if \nrows==12
mac_loop2 64, %xmm4, %xmm10
mac_loop2 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop2
addq \$8, %r11
jmp 1b
2:
# Last column receives x only. The x registers are scaled in place here,
# which is safe because this is their final use in the macro.
addq %r8, %r10 # %r10 => q(.,nb+1)
movddup (%r11), %xmm14
.macro mac_post_loop2 qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1)
mulpd %xmm14, \X
addpd \X, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_post_loop2 0, %xmm0
mac_post_loop2 16, %xmm1
.if \nrows>=8
mac_post_loop2 32, %xmm2
mac_post_loop2 48, %xmm3
.if \nrows==12
mac_post_loop2 64, %xmm4
mac_post_loop2 80, %xmm5
.endif
.endif
.purgem mac_post_loop2
.endm
# double_hh_trafo_
# Entry point for the real-valued kernel. The arguments arrive in %rdi (q),
# %rsi (hh), %rdx (nb), %rcx (nq), %r8 (ldq) and %r9 (ldh); the last four are
# addresses of 32-bit integers and are dereferenced below.
# NOTE(review): the trailing underscore and by-reference integers suggest a
# Fortran caller - confirm.
# Flow: convert ldq and ldh to byte counts, compute the scalar s and park it
# on the stack, then process q in strips of 12, 8 or 4 rows by expanding the
# hh_trafo_real macro.
# Scratch: %rax, %rcx, %rdx, %rdi, %r8-%r11, %xmm0-%xmm15, flags.
.align 16,0x90
double_hh_trafo_:
movslq (%rdx), %rdx # nb
movslq (%rcx), %rcx # nq
movslq (%r8), %r8 # ldq
movslq (%r9), %r9 # ldh
# Get ldq in bytes
addq %r8, %r8
addq %r8, %r8
addq %r8, %r8 # 8*ldq, i.e. ldq in bytes
# Get ldh in bytes
addq %r9, %r9
addq %r9, %r9
addq %r9, %r9 # 8*ldh, i.e. ldh in bytes
# %rax = %rsi + 8*nb - 8, the address of hh(nb,1); it is the end marker for
# the loop below and for the loops inside hh_trafo_real.
movq %rdx, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rsi, %rax
subq \$8, %rax
subq \$8, %rsp # reserve an 8-byte stack slot for s
# s = hh(2,2) + sum over i = 3..nb of hh(i-1,1)*hh(i,2)
movq %rsi, %r11 # Copy address of hh
movsd 8(%r11,%r9), %xmm0 # hh(2,2)
addq \$8, %r11
1:
# NOTE(review): jge is a signed comparison applied to addresses; jae would be
# the unsigned form - confirm whether that distinction matters here.
cmpq %rax, %r11
jge 2f
movsd (%r11), %xmm14 # hh(i-1,1)
movsd 8(%r11,%r9), %xmm15 # hh(i,2)
mulsd %xmm14, %xmm15
addsd %xmm15, %xmm0
addq \$8, %r11
jmp 1b
2:
movsd %xmm0, (%rsp) # put s on top of stack
#-----------------------------------------------------------
# Strip loop: while more than 8 rows remain, transform 12 rows at a time.
# NOTE(review): a remainder that is not a multiple of 4 is rounded up to the
# next strip size (for example 9 remaining rows run the 12-row code) -
# confirm that callers pad q accordingly.
rloop_s:
cmpq \$8, %rcx # if %rcx <= 8 jump out of loop
jle rloop_e
hh_trafo_real 12 # transform 12 rows
addq \$96, %rdi # increment q start address by 96 bytes (12 rows of 8 bytes)
subq \$12, %rcx # decrement nq
jmp rloop_s
rloop_e:
cmpq \$4, %rcx # if %rcx <= 4 jump to test_4
jle test_4
hh_trafo_real 8 # transform 8 rows
jmp return1
test_4:
cmpq \$0, %rcx # if %rcx <= 0 jump to return
jle return1
hh_trafo_real 4 # transform 4 rows
return1:
addq \$8, %rsp # reset stack pointer
ret
.align 16,0x90
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# hh_trafo_complex nrows
# Expands to straight-line code that updates one strip of complex q in place
# using the vector hh. Each xmm register carries one complex number (real
# part in the low half, imaginary part in the high half), so the strip is
# 2 registers wide by default, 4 when nrows>=4 and 6 when nrows==6.
# Reads:   %rdi  address of q
#          %rsi  address of hh
#          %rdx  nb, already loaded as an integer value
#          %r8   byte step from one column of q to the next
# Scratch: %rax (loop end address), %r10 (q pointer), %r11 (hh pointer),
#          %xmm0-%xmm15, flags.
# The inner helper macros are purged right after use; presumably this is what
# lets the outer macro be expanded several times without redefining them.
# The numeric labels 1 and 2 are reused by both loops.
.macro hh_trafo_complex nrows
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
# set %rax to the address of hh at the end of the loops,
# i.e. if %r11 >= %rax we must jump out of the loop.
# please note: %rax = 16*%rdx + %rsi
movq %rdx, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rax, %rax
addq %rsi, %rax
# x1 = q(1,1); y1 = 0
# x2 = q(2,1); y2 = 0
# ...
movaps (%r10), %xmm0
movaps 16(%r10), %xmm1
xorps %xmm6, %xmm6
xorps %xmm7, %xmm7
.if \nrows>=4
movaps 32(%r10), %xmm2
movaps 48(%r10), %xmm3
xorps %xmm8, %xmm8
xorps %xmm9, %xmm9
.if \nrows==6
movaps 64(%r10), %xmm4
movaps 80(%r10), %xmm5
xorps %xmm10, %xmm10
xorps %xmm11, %xmm11
.endif
.endif
addq \$16, %r11 # %r11 => hh(2)
# First loop: x collects q(.,i)*real(hh(i)); y collects the imaginary-part
# products with alternating sign (addsubpd subtracts in the low half and
# adds in the high half).
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # real(hh(i))
movddup 8(%r11), %xmm15 # imag(hh(i))
.macro mac_loop1 qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i)
movaps %xmm13, %xmm12
mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i))
addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i))
mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i))
addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i))
.endm
mac_loop1 0, %xmm0, %xmm6
mac_loop1 16, %xmm1, %xmm7
.if \nrows>=4
mac_loop1 32, %xmm2, %xmm8
mac_loop1 48, %xmm3, %xmm9
.if \nrows==6
mac_loop1 64, %xmm4, %xmm10
mac_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop1
addq \$16, %r11 # %r11 => hh(i+1)
jmp 1b
2:
# Now the content of the yn has to be swapped and added to xn
# (after this step x holds q(.,1) plus the sum of q(.,i) times the complex
# conjugate of hh(i))
.macro mac_post_loop_1 X, Y
shufpd \$1, \Y, \Y
addpd \Y, \X
.endm
mac_post_loop_1 %xmm0, %xmm6
mac_post_loop_1 %xmm1, %xmm7
.if \nrows>=4
mac_post_loop_1 %xmm2, %xmm8
mac_post_loop_1 %xmm3, %xmm9
.if \nrows==6
mac_post_loop_1 %xmm4, %xmm10
mac_post_loop_1 %xmm5, %xmm11
.endif
.endif
.purgem mac_post_loop_1
# tau1 = hh(1)
#
# h1 = -tau1
# x1 = x1*h1; y1 = x1 with halves exchanged
# x2 = x2*h1; y2 = x2 with halves exchanged
# ...
movq %rsi, %r11 # restore address of hh
xorps %xmm14, %xmm14
movddup (%r11), %xmm12 # real(hh(1))
subpd %xmm12, %xmm14 #-real(hh(1))
xorps %xmm15, %xmm15
movddup 8(%r11), %xmm12 # imag(hh(1))
subpd %xmm12, %xmm15 #-imag(hh(1))
# Complex multiply of x by h1: the swapped copy times the imaginary part is
# combined with x times the real part through addsubpd.
.macro mac_xform X, Y
movaps \X, %xmm12
shufpd \$1, \X, %xmm12
mulpd %xmm15, %xmm12
mulpd %xmm14, \X
addsubpd %xmm12, \X
movaps \X, \Y # copy to y
shufpd \$1, \X, \Y # exchange halves
.endm
mac_xform %xmm0, %xmm6
mac_xform %xmm1, %xmm7
.if \nrows>=4
mac_xform %xmm2, %xmm8
mac_xform %xmm3, %xmm9
.if \nrows==6
mac_xform %xmm4, %xmm10
mac_xform %xmm5, %xmm11
.endif
.endif
.purgem mac_xform
# q(1,1) = q(1,1) + x1
# q(2,1) = q(2,1) + x2
# ...
movq %rdi, %r10 # restore address of q
.macro mac_pre_loop2 qoff, X
movaps \qoff(%r10), %xmm13 # q(.,1)
addpd \X, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2 0, %xmm0
mac_pre_loop2 16, %xmm1
.if \nrows>=4
mac_pre_loop2 32, %xmm2
mac_pre_loop2 48, %xmm3
.if \nrows==6
mac_pre_loop2 64, %xmm4
mac_pre_loop2 80, %xmm5
.endif
.endif
.purgem mac_pre_loop2
# do i=2,nb
# h1 = hh(i)
# q(1,i) = q(1,i) + x1*h1
# q(2,i) = q(2,i) + x2*h1
# ...
# enddo
addq \$16, %r11
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # real(hh(i))
movddup 8(%r11), %xmm15 # imag(hh(i))
# x*real(hh(i)) combined with the swapped copy y*imag(hh(i)) through addsubpd
# gives the complex product x*hh(i), which is then added to q(.,i).
.macro mac_loop2 qoff, X, Y
movaps \X, %xmm13
mulpd %xmm14, %xmm13
movaps \Y, %xmm12
mulpd %xmm15, %xmm12
addsubpd %xmm12, %xmm13
addpd \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_loop2 0, %xmm0, %xmm6
mac_loop2 16, %xmm1, %xmm7
.if \nrows>=4
mac_loop2 32, %xmm2, %xmm8
mac_loop2 48, %xmm3, %xmm9
.if \nrows==6
mac_loop2 64, %xmm4, %xmm10
mac_loop2 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_loop2
addq \$16, %r11
jmp 1b
2:
.endm
# single_hh_trafo_complex_
# Entry point for the complex-valued kernel. The arguments arrive in %rdi (q),
# %rsi (hh), %rdx (nb), %rcx (nq) and %r8 (ldq); the last three are addresses
# of 32-bit integers and are dereferenced below.
# NOTE(review): the trailing underscore and by-reference integers suggest a
# Fortran caller - confirm.
# Flow: convert ldq to a byte count (16 bytes per complex element), then
# process q in strips of 6, 4 or 2 rows by expanding hh_trafo_complex.
# Scratch: %rax, %rcx, %rdx, %rdi, %r8, %r10, %r11, %xmm0-%xmm15, flags.
.align 16,0x90
single_hh_trafo_complex_:
# Get integer parameters into corresponding registers
movslq (%rdx), %rdx # nb
movslq (%rcx), %rcx # nq
movslq (%r8), %r8 # ldq
# Get ldq in bytes
addq %r8, %r8
addq %r8, %r8
addq %r8, %r8
addq %r8, %r8 # 16*ldq, i.e. ldq in bytes
# Strip loop: while more than 4 rows remain, transform 6 rows at a time.
# NOTE(review): an odd remainder is rounded up to the next strip size (for
# example 5 remaining rows run the 6-row code) - confirm that callers pad q
# accordingly.
cloop_s:
cmpq \$4, %rcx # if %rcx <= 4 jump out of loop
jle cloop_e
hh_trafo_complex 6 # transform 6 rows
addq \$96, %rdi # increment q start address by 96 bytes (6 rows)
subq \$6, %rcx # decrement nq
jmp cloop_s
cloop_e:
cmpq \$2, %rcx # if %rcx <= 2 jump to test_2
jle test_2
hh_trafo_complex 4 # transform 4 rows
jmp return2
test_2:
cmpq \$0, %rcx # if %rcx <= 0 jump to return
jle return2
hh_trafo_complex 2 # transform 2 rows
return2:
ret
.align 16,0x90
EOF
# Run the compiler driver on the probe and record the outcome in
# can_compile_sse, install_real_sse and install_complex_sse.
# NOTE(review): this span appears to interleave the removed and the added side
# of the diff hunk (two compile commands, two else keywords, two rm lines);
# confirm against the real file before relying on the control flow shown here.
# NOTE(review): == inside test is a bash extension; POSIX test compares
# strings with a single = - confirm which shells must be supported.
$CC -c ./test.s
if test "$?" == 0; then
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
install_complex_sse=yes
else
else
can_compile_sse=no
install_real_sse=no
install_complex_sse=no
fi
rm -f ./test.s ./test.o
fi
rm -f ./test.o
# Report the result on the log (fd 5) and on the console (fd 6).
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse}" >&5
$as_echo "${can_compile_sse}" >&6; }
...
...
ELPA_2014.06/configure.ac
View file @
f16576db
...
...
@@ -90,580 +90,17 @@ install_complex_avx_block1=no
install_complex_avx_block2=no
AC_MSG_CHECKING(whether SSE assembler kernel can be compiled)
echo <<EOF > ./test.s
.globl double_hh_trafo_
.globl single_hh_trafo_complex_
.text
.macro hh_trafo_real nrows
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
movaps (%r10), %xmm6 # y1 = q(1,1)
movaps 16(%r10), %xmm7 # y2 = q(2,1)
.if \nrows>=8
movaps 32(%r10), %xmm8
movaps 48(%r10), %xmm9
.if \nrows==12
movaps 64(%r10), %xmm10
movaps 80(%r10), %xmm11
.endif
.endif
addq %r8, %r10 # %r10 => q(.,2)
movddup 8(%r11,%r9), %xmm15 # hh(2,2)
.macro mac_pre_loop1 qoff, X, Y
movaps \qoff(%r10), \X # xn = q(n,2)
movaps \X, %xmm12
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + xn*h(2,2)
.endm
mac_pre_loop1 0, %xmm0, %xmm6
mac_pre_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_pre_loop1 32, %xmm2, %xmm8
mac_pre_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_pre_loop1 64, %xmm4, %xmm10
mac_pre_loop1 80, %xmm5, %xmm11
.endif
.endif
.purgem mac_pre_loop1
addq \$8, %r11
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
movddup (%r11), %xmm14 # hh(i-1,1)
movddup 8(%r11,%r9), %xmm15 # hh(i,2)
.macro mac_loop1 qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i)
movaps %xmm13, %xmm12
mulpd %xmm14, %xmm13
addpd %xmm13, \X # xn = xn + q(.,i)*h1
mulpd %xmm15, %xmm12
addpd %xmm12, \Y # yn = yn + q(.,i)*h2
.endm
mac_loop1 0, %xmm0, %xmm6
mac_loop1 16, %xmm1, %xmm7
.if \nrows>=8
mac_loop1 32, %xmm2, %xmm8
mac_loop1 48, %xmm3, %xmm9
.if \nrows==12
mac_loop1 64, %xmm4, %xmm10