Single precision assembly kernel for ELPA2 real and complex

The single-precision version of the SSE assembly kernel is about 1.8
times faster than the double-precision version.
parent c66afc1c
@@ -81,10 +81,17 @@ if WITH_REAL_BGQ_KERNEL
endif
if WITH_REAL_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
else
if WITH_COMPLEX_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
endif
endif
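# Note: the WANT_SINGLE_PRECISION_REAL / WANT_SINGLE_PRECISION_COMPLEX
# conditionals used above are assumed to be defined in configure.ac, e.g.
# (illustrative sketch only, not part of this change):
#   AM_CONDITIONAL([WANT_SINGLE_PRECISION_REAL],
#                  [test x"$want_single_precision" = x"yes"])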
@@ -332,6 +339,7 @@ elpa2_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_LDADD = $(build_lib)
@@ -349,6 +357,7 @@ elpa2_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib)
@@ -101,6 +101,14 @@ AM_PROG_AR
AM_PROG_AS
# Fortran
dnl check whether single precision is requested
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
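dnl (usage sketch, for illustration: the extra single precision paths are
dnl  then requested with ./configure --enable-single-precision)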
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
@@ -190,9 +198,9 @@ if test x"${with_ftimings}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])
AC_MSG_CHECKING(whether SSE assembler kernel can be compiled)
AC_MSG_CHECKING(whether double-precision SSE assembler kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
@@ -205,6 +213,26 @@ fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
if test x"${want_single_precision}" = x"yes" ; then
AC_MSG_CHECKING(whether single-precision SSE assembler kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
install_complex_sse=yes
else
can_compile_sse=no
install_real_sse=no
install_complex_sse=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_WARN([Cannot compile single-precision SSE kernel: disabling SSE kernels altogether])
fi
fi
dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS
@@ -522,14 +550,6 @@ if test x"${want_gpu}" = x"yes" ; then
can_compile_gpu=yes
fi
dnl check whether single precision is requested
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
dnl now check which kernels can be compiled
dnl the checks for SSE were already done before
@@ -829,3 +849,7 @@ if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
fi
if test "${can_compile_sse}" = "no" ; then
AC_MSG_WARN([Could not compile SSE instructions])
fi
@@ -301,12 +301,14 @@ contains
! some temporary checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
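! (note on the rewrite above: the old .ne./.or. test was always true, so it
!  aborted for every kernel choice; hence the .eq. chain with an empty branch)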
#endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
@@ -564,7 +566,6 @@ contains
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
@@ -641,14 +642,7 @@ contains
stop
endif
! some temporary checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
@@ -657,6 +651,16 @@
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
@@ -943,12 +947,15 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
@@ -1266,11 +1273,13 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
# --------------------------------------------------------------------------------------------------
#
# This file contains the compute intensive kernels for the Householder transformations,
# coded in x86_64 assembler and using SSE2/SSE3 instructions.
#
# It must be assembled with GNU assembler (just "as" on most Linux machines)
#
# Copyright of the original code rests with the authors inside the ELPA
# consortium. The copyright of any additional modifications shall rest
# with their original authors, but shall adhere to the licensing terms
# distributed along with the original code in the file "COPYING".
#
# --------------------------------------------------------------------------------------------------
.globl double_hh_trafo_single_
.globl single_hh_trafo_complex_single_
.text
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
.macro hh_trafo_real_single nrows
# When this macro is called, the following registers are set and must not be changed
# %rdi: Address of q
# %rsi: Address of hh
# %rdx: nb
# %rcx: Remaining rows nq
# %r8: ldq in bytes
# %r9: ldh in bytes
# %rax: address of hh at the end of the loops
# The top of the stack must contain the dot product of the two Householder vectors
movq %rdi, %r10 # Copy address of q
movq %rsi, %r11 # Copy address of hh
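# Summary added for readability (hedged sketch of what follows): with
# v1 = [1, hh(2,1)..hh(nb,1)] and v2 = [1, hh(2,2)..hh(nb,2)], this macro
# accumulates x = q(.,2:nb+1)*v1 in %xmm0-%xmm2 and y = q(.,1:nb)*v2 in
# %xmm6-%xmm8 (4 floats per register), rescales both with tau1 = hh(1,1),
# tau2 = hh(1,2) and the dot product s taken from the stack, and finally
# applies the rank-2 update q(.,1:nb) += y*v2^T and q(.,2:nb+1) += x*v1^T.
# %xmm9 and %xmm11 hold broadcast coefficients, %xmm10/%xmm13 are scratch.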
# x1 = q(1,2)
# x2 = q(2,2)
#
# y1 = q(1,1) + q(1,2)*hh(2,2)
# y2 = q(2,1) + q(2,2)*hh(2,2)
# single precision implementation does not rely on complex packing !
movaps (%r10), %xmm6 # y1 = q(1,1) ; copy content (16 bytes) starting at address %r10 (q(1,1)) into xmm6 (16 bytes = first 4 single precision values)
# y2 = q(2,1)
# y3 = q(3,1)
# y4 = q(4,1)
.if \nrows>=8
movaps 16(%r10), %xmm7 # y5 = q(5,1) ; copy content of address r10+16 = q(5,1) into xmm7 (16 bytes = single precision values 5, 6, 7, 8)
# y6 = q(6,1)
# y7 = q(7,1)
# y8 = q(8,1)
.if \nrows==12
movaps 32(%r10), %xmm8 # y9 = q(9,1) ; copy content of address r10+32 = q(9,1) into xmm8 (16 bytes = single precision values 9, 10, 11, 12)
# y10 = q(10,1)
# y11 = q(11,1)
# y12 = q(12,1)
.endif
.endif
addq %r8, %r10 # %r10 => q(.,2) ; add ldq (in bytes) to r10: r10 = r10 + r8
# careful here! we want to store in xmm9 four times the value of hh(2,2)
movddup 4(%r11,%r9), %xmm13 # hh(2,2) # copy 8 bytes from address %r11 + ldh (would be hh(1,2)) + 4 bytes => hh(2,2), and duplicate; xmm13 contains hh(2,2), hh(3,2), hh(2,2), hh(3,2)
movsldup %xmm13, %xmm9 # copy the first 4 bytes (h(2,2)) and duplicate, the same for the third 4 bytes => xmm9 contains h(2,2), h(2,2), h(2,2), h(2,2)
# movshdup %xmm13, %xmm9
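# The same broadcast written as a C intrinsics sketch (illustration only, not
# part of this file; hh here is assumed to point at hh(1,2)):
#   __m128 pair = _mm_castpd_ps(_mm_loaddup_pd((double const *)(hh + 1))); /* movddup  */
#   __m128 h22  = _mm_moveldup_ps(pair); /* movsldup: even lanes duplicated => hh(2,2) x4 */
#   /* _mm_movehdup_ps(pair) (movshdup) would broadcast hh(3,2) instead */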
.macro mac_pre_loop1_single qoff, X, Y
movaps \qoff(%r10), \X # xn = q(n,2) ; load 4 single precision values of q(.,2), starting qoff bytes into the column
movaps \X, %xmm10 # copy those 4 values into xmm10
mulps %xmm9, %xmm10 # multiply 4 single precision values xmm9 (four times h(2,2)) with four single precision values q(1+qoff,2)..q(4+qoff,2) stored in xmm10; store result in xmm10
addps %xmm10, \Y # yn = yn + xn*h(2,2) # add the four values in xmm10 (q(1+qoff,2)*h(2,2)..q(4+qoff,2)*h(2,2)) and \Y ; store in Y
.endm
mac_pre_loop1_single 0, %xmm0, %xmm6 # do the step y(1:4) = q(1:4,1) +q(1:4,2)*h(2,2) for the first 4 single precision floats
.if \nrows>=8
mac_pre_loop1_single 16, %xmm1, %xmm7 # for the next 4 floats
.if \nrows==12
mac_pre_loop1_single 32, %xmm2, %xmm8 # for the next 4 floats
.endif
.endif
.purgem mac_pre_loop1_single
# do i=3,nb
# h1 = hh(i-1,1)
# h2 = hh(i,2)
# x1 = x1 + q(1,i)*h1
# y1 = y1 + q(1,i)*h2
# x2 = x2 + q(2,i)*h1
# y2 = y2 + q(2,i)*h2
# ...
# enddo
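# One iteration of this loop as a C intrinsics sketch (illustration only;
# h1x4/h2x4 stand for the broadcast values built in %xmm11 and %xmm9 below):
#   __m128 qv = _mm_load_ps(q_col);             /* movaps (%r10), %xmm13 */
#   x = _mm_add_ps(x, _mm_mul_ps(qv, h1x4));    /* mulps %xmm11 ; addps  */
#   y = _mm_add_ps(y, _mm_mul_ps(qv, h2x4));    /* mulps %xmm9  ; addps  */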
addq $4, %r11 # r11 points to hh(1,1) + 4 bytes = hh(2,1)
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # advance i %r10 => q(.,i)
# careful here we want xmm11 to contain four times the value of hh(i-1,1)
movddup (%r11), %xmm13 # copy the first 8 bytes at r11 and duplicate ; xmm13 contains hh(i-1,1), hh(i,1), hh(i-1,1), hh(i,1)
movsldup %xmm13, %xmm11 # copy the first 4 bytes (h(i-1,1)) and duplicate, the same for the third 4 bytes => xmm11 contains h(i-1,1), h(i-1,1), h(i-1,1), h(i-1,1)
# movshdup %xmm13, %xmm11
# careful here we want xmm9 to contain four times the value of hh(i,2)
movddup 4(%r11,%r9), %xmm13 # add to hh(i-1,1) ldh (r9) bytes => hh(i-1,2) add 4 extra bytes => hh(i,2) and duplicate ; xmm13 contains hh(i,2), hh(i+1,2), hh(i,2), hh(i+1,2)
movsldup %xmm13, %xmm9 # copy the first 4 bytes (h(i,2)) and duplicate, the same for the third 4 bytes => xmm9 contains h(i,2), h(i,2), h(i,2), h(i,2)
# movshdup %xmm13, %xmm9
.macro mac_loop1_single qoff, X, Y
movaps \qoff(%r10), %xmm13 # q(.,i) copy q(1,i), q(2,i), q(3,i), q(4,i) into xmm13
movaps %xmm13, %xmm10 # copy q(1,i), q(2,i), q(3,i) and q(4,i) into xmm10
mulps %xmm11, %xmm13 # multiply q(1,i), q(2,i), q(3,i), q(4,i) with h(i-1,1), h(i-1,1), h(i-1,1), h(i-1,1) ; store in xmm13
addps %xmm13, \X # xn = xn + q(.,i)*h1 ; add h1*q(.,i) to x and store in x
mulps %xmm9, %xmm10 # multiply hh(i,2), h(i,2), h(i,2), h(i,2) with q(1,i), q(2,i), q(3,i), q(4,i) store into xmm10
addps %xmm10, \Y # yn = yn + q(.,i)*h2 ; add q(.,i)*h2 to Y store in y
.endm
mac_loop1_single 0, %xmm0, %xmm6
.if \nrows>=8
mac_loop1_single 16, %xmm1, %xmm7
.if \nrows==12
mac_loop1_single 32, %xmm2, %xmm8
.endif
.endif
.purgem mac_loop1_single
addq $4, %r11
jmp 1b
2:
# x1 = x1 + q(1,nb+1)*hh(nb,1)
# x2 = x2 + q(2,nb+1)*hh(nb,1)
addq %r8, %r10 # %r10 => q(.,nb+1) # add ldq on q +> q(.,nb+1)
# careful here we want xmm11 to contain four times the value hh(nb,1)
movddup (%r11), %xmm13 # copy hh(nb,1) hh(nb+1,1) into xmm13 and duplicate
movsldup %xmm13, %xmm11 # copy the first 4 bytes (h(nb,1)) and duplicate, the same for the third 4 bytes => xmm11 contains h(nb,1), h(nb,1), h(nb,1), h(nb,1)
# movshdup %xmm13, %xmm11
.macro mac_post_loop1_single qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1) copy q(1,nb+1), q(2,nb+1) q(3,nb+1), q(4,nb+1) into xmm13
mulps %xmm11, %xmm13 # multiply hh(nb,1) hh(nb,1) hh(nb,1) hh(nb,1) with q(1,nb+1), q(2,nb+1) q(3,nb+1), q(4,nb+1) store in xmm13
addps %xmm13, \X # add hh(nb,1)*q(.,nb+1) and x store in x
.endm
mac_post_loop1_single 0, %xmm0
.if \nrows>=8
mac_post_loop1_single 16, %xmm1
.if \nrows==12
mac_post_loop1_single 32, %xmm2
.endif
.endif
.purgem mac_post_loop1_single
# tau1 = hh(1,1)
# tau2 = hh(1,2)
#
# h1 = -tau1
# x1 = x1*h1
# x2 = x2*h1
movq %rsi, %r11 # restore %r11 (hh(1,1))
# careful here we want xmm10 to contain four times the value hh(1,1)
movddup (%r11), %xmm13 # copy hh(1,1) hh(2,1) into xmm13 and duplicate
movsldup %xmm13, %xmm10 # copy the first 4 bytes (hh(1,1)) and duplicate, the same for the third 4 bytes => xmm10 contains h(1,1), h(1,1), h(1,1), h(1,1)
# movshdup %xmm13, %xmm10
xorps %xmm11, %xmm11
subps %xmm10, %xmm11 # %xmm11 = -hh(1,1)
mulps %xmm11, %xmm0
.if \nrows>=8
mulps %xmm11, %xmm1
.if \nrows==12
mulps %xmm11, %xmm2
.endif
.endif
# h1 = -tau2
# h2 = -tau2*s
# y1 = y1*h1 + x1*h2
# y2 = y2*h1 + x2*h2
# careful here we want xmm10 to contain four times hh(1,2)
movddup (%r11,%r9), %xmm13 # xmm13 contains hh(1,2) hh(2,2) and duplicate
movsldup %xmm13, %xmm10 # copy the first 4 bytes (hh(1,2)) and duplicate, the same for the third 4 bytes => xmm10 contains h(1,2), h(1,2), h(1,2), h(1,2)
# movshdup %xmm13, %xmm10
xorps %xmm9, %xmm9
subps %xmm10, %xmm9 # %xmm9 = -hh(1,2) = h1
movaps %xmm9, %xmm11
# careful here we want xmm10 to contain four times the value of s
movddup (%rsp), %xmm13 # Get s from top of stack (8 bytes: s plus 4 unknown bytes x) and duplicate: | s | x | s | x |
movsldup %xmm13, %xmm10 # copy the first 4 bytes (s) and duplicate, the same for the third 4 bytes => xmm10 contains s,s,s,s
# movshdup %xmm13, %xmm10
mulps %xmm10, %xmm11 # %xmm11 = h2 = -hh(1,2)*s
.macro mac_xform_y_single X, Y
mulps %xmm9, \Y # y1 = y1*h1
movaps \X, %xmm10
mulps %xmm11, %xmm10
addps %xmm10, \Y
.endm
mac_xform_y_single %xmm0, %xmm6
.if \nrows>=8
mac_xform_y_single %xmm1, %xmm7
.if \nrows==12
mac_xform_y_single %xmm2, %xmm8
.endif
.endif
.purgem mac_xform_y_single
# q(1,1) = q(1,1) + y1
# q(2,1) = q(2,1) + y2
movq %rdi, %r10 # restore original Q
.macro mac_pre_loop2_1_single qoff, Y
movaps \qoff(%r10), %xmm13 # q(.,1)
addps \Y, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_1_single 0, %xmm6
.if \nrows>=8
mac_pre_loop2_1_single 16, %xmm7
.if \nrows==12
mac_pre_loop2_1_single 32, %xmm8
.endif
.endif
.purgem mac_pre_loop2_1_single
# q(1,2) = q(1,2) + x1 + y1*hh(2,2)
# q(2,2) = q(2,2) + x2 + y2*hh(2,2)
addq %r8, %r10 # %r10 => q(.,2)
# careful here we want xmm9 to contain 4 times the value of h(2,2)
movddup 4(%r11,%r9), %xmm13 # xmm13 contains hh(2,2) hh(3,2) and duplicate
movsldup %xmm13, %xmm9 # copy the first 4 bytes (hh(2,2)) and duplicate, the same for the third 4 bytes => xmm9 contains h(2,2), h(2,2), h(2,2), h(2,2)
# movshdup %xmm13, %xmm9
.macro mac_pre_loop2_2_single qoff, X, Y
movaps \X, %xmm13
movaps \Y, %xmm10
mulps %xmm9, %xmm10
addps %xmm10, %xmm13
addps \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_pre_loop2_2_single 0, %xmm0, %xmm6
.if \nrows>=8
mac_pre_loop2_2_single 16, %xmm1, %xmm7
.if \nrows==12
mac_pre_loop2_2_single 32, %xmm2, %xmm8
.endif
.endif
.purgem mac_pre_loop2_2_single
# do i=3,nb
# h1 = hh(i-1,1)
# h2 = hh(i,2)
# q(1,i) = q(1,i) + x1*h1 + y1*h2
# q(2,i) = q(2,i) + x2*h1 + y2*h2
# enddo
addq $4, %r11
.align 16
1:
cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
jge 2f
addq %r8, %r10 # %r10 => q(.,i)
# careful here we want xmm11 to contain 4 times the value of hh(i-1,1)
movddup (%r11), %xmm13 # hh(i-1,1) | hh(i,1) | hh(i-1,1) | hh(i,1)
movsldup %xmm13, %xmm11 # copy the first 4 bytes hh(i-1,1)
# movshdup %xmm13, %xmm11
# careful here we want xmm9 to contain 4 times the value of hh(i,2)
movddup 4(%r11,%r9), %xmm13 # hh(i,2) | hh(i+1,2) and duplicate
movsldup %xmm13, %xmm9 # copy the first 4 bytes hh(i,2)
# movshdup %xmm13, %xmm9
.macro mac_loop2_single qoff, X, Y
movaps \X, %xmm13
mulps %xmm11, %xmm13
movaps \Y, %xmm10
mulps %xmm9, %xmm10
addps %xmm10, %xmm13
addps \qoff(%r10), %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_loop2_single 0, %xmm0, %xmm6
.if \nrows>=8
mac_loop2_single 16, %xmm1, %xmm7
.if \nrows==12
mac_loop2_single 32, %xmm2, %xmm8
.endif
.endif
.purgem mac_loop2_single
addq $4, %r11
jmp 1b
2:
# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
addq %r8, %r10 # %r10 => q(.,nb+1)
# careful here we want xmm11 to contain 4 times the value of hh(nb,1)
movddup (%r11), %xmm13 # hh(nb,1) | hh(nb+1,1) and duplicate
movsldup %xmm13, %xmm11 # copy the first 4 bytes hh(nb,1)
# movshdup %xmm13, %xmm11
.macro mac_post_loop2_single qoff, X
movaps \qoff(%r10), %xmm13 # q(.,nb+1)
mulps %xmm11, \X
addps \X, %xmm13
movaps %xmm13, \qoff(%r10)
.endm
mac_post_loop2_single 0, %xmm0
.if \nrows>=8
mac_post_loop2_single 16, %xmm1
.if \nrows==12
mac_post_loop2_single 32, %xmm2
.endif
.endif
.purgem mac_post_loop2_single
.endm
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
# subroutine double_hh_trafo_single(q, hh, nb, nq, ldq, ldh)
#
# integer, intent(in) :: nb, nq, ldq, ldh
# real*4, intent(inout) :: q(ldq,*)
# real*4, intent(in) :: hh(ldh,*)
#
# Parameter mapping to registers
# parameter 1: %rdi : q
# parameter 2: %rsi : hh
# parameter 3: %rdx : nb
# parameter 4: %rcx : nq
# parameter 5: %r8 : ldq
# parameter 6: %r9 : ldh
#
#-------------------------------------------------------------------------------
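# For reference, the same interface as a C prototype (our sketch, assuming the
# usual trailing-underscore Fortran name mangling; all arguments are passed by
# address, which is why the integers are fetched with movslq below):
#
#   void double_hh_trafo_single_(float *q, float const *hh,
#                                int const *nb, int const *nq,
#                                int const *ldq, int const *ldh);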
.align 16,0x90
double_hh_trafo_single_:
# Get integer parameters into corresponding registers
movslq (%rdx), %rdx # nb
movslq (%rcx), %rcx # nq
movslq (%r8), %r8 # ldq
movslq (%r9), %r9 # ldh
# Get ldq in bytes
addq %r8, %r8
addq %r8, %r8 # 4*ldq, i.e. ldq in bytes
# Get ldh in bytes
addq %r9, %r9
addq %r9, %r9 # 4*ldh, i.e. ldh in bytes
# set %rax to the address of hh at the end of the loops,
# i.e. if %rdx >= %rax we must jump out of the loop.
# please note: %rax = 4*%rdx + %rsi - 4
movq %rdx, %rax
addq %