Single precision assembly kernel for ELPA2 real and complex

The single-precision version of the SSE assembly kernel is about 1.8
times faster than the double-precision version.
parent c66afc1c
......@@ -81,10 +81,17 @@ if WITH_REAL_BGQ_KERNEL
endif
if WITH_REAL_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
else
if WITH_COMPLEX_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
endif
endif
......@@ -332,6 +339,7 @@ elpa2_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_LDADD = $(build_lib)
......@@ -349,6 +357,7 @@ elpa2_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib)
......
......@@ -101,6 +101,14 @@ AM_PROG_AR
AM_PROG_AS
# Fortran
dnl Check whether single precision is requested.
dnl The action-if-given branch runs for every spelling of the option, also for
dnl --disable-single-precision and --enable-single-precision=no, so it has to
dnl look at enableval instead of unconditionally answering yes.
dnl Without the option the default stays no.
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
[if test x"$enableval" = x"no" ; then want_single_precision="no" ; else want_single_precision="yes" ; fi],
[want_single_precision="no"])
AC_MSG_RESULT([${want_single_precision}])
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
......@@ -190,9 +198,9 @@ if test x"${with_ftimings}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])
AC_MSG_CHECKING(whether SSE assembler kernel can be compiled)
AC_MSG_CHECKING(whether double-precision SSE assembler kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
......@@ -205,6 +213,26 @@ fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
dnl When single precision is requested the single-precision SSE assembler
dnl kernel has to compile as well.
dnl A failure disables the SSE kernels completely.
dnl A success leaves the outcome of the earlier double-precision check alone,
dnl so a previous no is never switched back to yes.
if test x"${want_single_precision}" = x"yes" ; then
  AC_MSG_CHECKING(whether single-precision SSE assembler kernel can be compiled)
  dnl Probe the single-precision source, the double-precision one was tested before.
  dnl The status comparison uses = because == is not portable across shells.
  $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s -o test.o 2>/dev/null
  if test "$?" = 0; then
    can_compile_sse_single=yes
  else
    can_compile_sse_single=no
    can_compile_sse=no
    install_real_sse=no
    install_complex_sse=no
  fi
  rm -f ./test.o
  AC_MSG_RESULT([${can_compile_sse_single}])
  if test x"${can_compile_sse_single}" = x"no" ; then
    AC_MSG_WARN([Cannot compile single-precision SSE kernel: disabling SSE kernels alltogether])
  fi
fi
dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS
......@@ -522,14 +550,6 @@ if test x"${want_gpu}" = x"yes" ; then
can_compile_gpu=yes
fi
dnl check whether single precision is requested
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
dnl now check which kernels can be compiled
dnl the checks for SSE were already done before
......@@ -829,3 +849,7 @@ if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
fi
if test "${can_compile_sse}" = "no" ; then
AC_MSG_WARN([Could not compile SSE instructions])
fi
......@@ -301,12 +301,14 @@ contains
! some temporary checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......@@ -564,7 +566,6 @@ contains
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
......@@ -641,14 +642,7 @@ contains
stop
endif
! some temporarilly checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
......@@ -657,6 +651,16 @@ contains
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
......@@ -943,12 +947,15 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
......@@ -1266,11 +1273,13 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
......
This diff is collapsed.
......@@ -770,8 +770,8 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
#endif
real(kind=rk8) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_12_generic_single")
......@@ -984,8 +984,8 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, &
y1, y2, y3, y4, y5, y6, y7, y8
#endif
real(kind=rk8) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_8_generic_single")
......@@ -1153,7 +1153,7 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, y1, y2, y3, y4
#endif
real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_4_generic_single")
......
......@@ -492,13 +492,11 @@ module compute_hh_trafo_complex
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
print *,"at the moment no sse single-precision kernel"
stop
! call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
! bcast_buffer(1,j+off),nbw,nl,stripe_width)
call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
! call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), &
! bcast_buffer(1,j+off),nbw,nl,stripe_width)
call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
......
......@@ -708,14 +708,12 @@ module compute_hh_trafo_real
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
print *,"at the moment no single-precision sse kernel"
stop
#ifdef WITH_OPENMP
! call double_hh_trafo_single(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
! stripe_width, nbw)
call double_hh_trafo_single(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
stripe_width, nbw)
#else
! call double_hh_trafo_single(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
! stripe_width, nbw)
call double_hh_trafo_single(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
stripe_width, nbw)
#endif
enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......
......@@ -94,7 +94,7 @@ module mod_read_input_parameters
read(arg3, *) nblk
if (arg4 .eq. "output_eigenvalues") then
write_to_file%eigenvectors = .true.
write_to_file%eigenvalues = .true.
else
write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted"
stop 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment