Single precision assembly kernel for ELPA2 real and complex

The single-precision version of the SSE assembly kernel is about 1.8
times faster than the double-precision version.
parent c66afc1c
...@@ -81,10 +81,17 @@ if WITH_REAL_BGQ_KERNEL ...@@ -81,10 +81,17 @@ if WITH_REAL_BGQ_KERNEL
endif endif
if WITH_REAL_SSE_KERNEL if WITH_REAL_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
else else
if WITH_COMPLEX_SSE_KERNEL if WITH_COMPLEX_SSE_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif
endif endif
endif endif
...@@ -332,6 +339,7 @@ elpa2_test_real_single_precision@SUFFIX@_LDADD = $(build_lib) ...@@ -332,6 +339,7 @@ elpa2_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib) elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_LDADD = $(build_lib) elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_LDADD = $(build_lib)
...@@ -349,6 +357,7 @@ elpa2_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib) ...@@ -349,6 +357,7 @@ elpa2_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib) elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
#elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDFLAGS = -static
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources)
elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib) elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib)
......
...@@ -101,6 +101,14 @@ AM_PROG_AR ...@@ -101,6 +101,14 @@ AM_PROG_AR
AM_PROG_AS AM_PROG_AS
# Fortran # Fortran
dnl check whether single precision is requested
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
AC_LANG([Fortran]) AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4]) m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no]) AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
...@@ -190,9 +198,9 @@ if test x"${with_ftimings}" = x"yes"; then ...@@ -190,9 +198,9 @@ if test x"${with_ftimings}" = x"yes"; then
fi fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"]) AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])
AC_MSG_CHECKING(whether SSE assembler kernel can be compiled) AC_MSG_CHECKING(whether double-precision SSE assembler kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then if test "$?" == 0; then
can_compile_sse=yes can_compile_sse=yes
install_real_sse=yes install_real_sse=yes
...@@ -205,6 +213,26 @@ fi ...@@ -205,6 +213,26 @@ fi
rm -f ./test.o rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}]) AC_MSG_RESULT([${can_compile_sse}])
dnl The single-precision SSE kernel lives in its own assembly file, so probe that
dnl file (not the double-precision one) when single precision was requested.
dnl A failure disables the SSE kernels for both precisions; a success leaves the
dnl outcome of the preceding double-precision check untouched, so an earlier "no"
dnl is never silently turned back into "yes".
if test x"${want_single_precision}" = x"yes" ; then
  AC_MSG_CHECKING(whether single-precision SSE assembler kernel can be compiled)
  dnl Branch on the compiler exit status directly; "test ... == ..." is a bashism
  if $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s -o test.o 2>/dev/null; then
    can_compile_sse_single_precision=yes
  else
    can_compile_sse_single_precision=no
    can_compile_sse=no
    install_real_sse=no
    install_complex_sse=no
  fi
  rm -f ./test.o
  AC_MSG_RESULT([${can_compile_sse_single_precision}])
  if test x"${can_compile_sse_single_precision}" = x"no" ; then
    AC_MSG_WARN([Cannot compile single-precision SSE kernel: disabling SSE kernels alltogether])
  fi
fi
dnl check whether one can compile with avx - gcc intrinsics dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS dnl first pass: try with specified CFLAGS and CXXFLAGS
...@@ -522,14 +550,6 @@ if test x"${want_gpu}" = x"yes" ; then ...@@ -522,14 +550,6 @@ if test x"${want_gpu}" = x"yes" ; then
can_compile_gpu=yes can_compile_gpu=yes
fi fi
dnl check whether single precision is requested
AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
dnl now check which kernels can be compiled dnl now check which kernels can be compiled
dnl the checks for SSE were already done before dnl the checks for SSE were already done before
...@@ -829,3 +849,7 @@ if test "${can_compile_avx2}" = "no" ; then ...@@ -829,3 +849,7 @@ if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Could not compile AVX2 instructions]) AC_MSG_WARN([Could not compile AVX2 instructions])
fi fi
fi fi
if test "${can_compile_sse}" = "no" ; then
AC_MSG_WARN([Could not compile SSE instructions])
fi
...@@ -301,12 +301,14 @@ contains ...@@ -301,12 +301,14 @@ contains
! some temporarilly checks until single precision works with all kernels ! some temporarilly checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL #ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. & (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
print *,"At the moment single precision only works with the generic kernels" (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
stop else
endif print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif #endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
...@@ -564,7 +566,6 @@ contains ...@@ -564,7 +566,6 @@ contains
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false. wantDebug = .false.
if (firstCall) then if (firstCall) then
! are debug messages desired? ! are debug messages desired?
...@@ -641,14 +642,7 @@ contains ...@@ -641,14 +642,7 @@ contains
stop stop
endif endif
! some temporarilly checks until single precision works with all kernels ! some temporarilly checks until single precision works with all kernels
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) ) then
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! set the neccessary parameters ! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice() cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost() cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
...@@ -657,6 +651,16 @@ contains ...@@ -657,6 +651,16 @@ contains
cudaHostRegisterMapped = cuda_hostRegisterMapped() cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif endif
#ifndef DOUBLE_PRECISION_REAL
if ( (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal. ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32! ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
...@@ -943,12 +947,15 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -943,12 +947,15 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
endif endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif endif
#ifndef DOUBLE_PRECISION_COMPLEX #ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
print *,"At the moment single precision only works with the generic kernels" (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
stop else
endif print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif #endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
...@@ -1266,11 +1273,13 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -1266,11 +1273,13 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif endif
#ifndef DOUBLE_PRECISION_COMPLEX #ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC) .or. & if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .ne. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) ) then (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
print *,"At the moment single precision only works with the generic kernels" (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
stop else
endif print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif #endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
......
This diff is collapsed.
...@@ -770,8 +770,8 @@ module real_generic_kernel ...@@ -770,8 +770,8 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, & real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, &
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12
#endif #endif
real(kind=rk8) :: h1, h2, tau1, tau2 real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_12_generic_single") call timer%start("kernel generic: hh_trafo_kernel_12_generic_single")
...@@ -984,8 +984,8 @@ module real_generic_kernel ...@@ -984,8 +984,8 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, & real(kind=rk4) :: x1, x2, x3, x4, x5, x6, x7, x8, &
y1, y2, y3, y4, y5, y6, y7, y8 y1, y2, y3, y4, y5, y6, y7, y8
#endif #endif
real(kind=rk8) :: h1, h2, tau1, tau2 real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_8_generic_single") call timer%start("kernel generic: hh_trafo_kernel_8_generic_single")
...@@ -1153,7 +1153,7 @@ module real_generic_kernel ...@@ -1153,7 +1153,7 @@ module real_generic_kernel
real(kind=rk4) :: x1, x2, x3, x4, y1, y2, y3, y4 real(kind=rk4) :: x1, x2, x3, x4, y1, y2, y3, y4
#endif #endif
real(kind=rk4) :: h1, h2, tau1, tau2 real(kind=rk4) :: h1, h2, tau1, tau2
integer(kind=ik) :: i integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic: hh_trafo_kernel_4_generic_single") call timer%start("kernel generic: hh_trafo_kernel_4_generic_single")
......
...@@ -492,13 +492,11 @@ module compute_hh_trafo_complex ...@@ -492,13 +492,11 @@ module compute_hh_trafo_complex
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 1, -1 do j = ncols, 1, -1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
print *,"at the moment no sse single-precision kernel" call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
stop bcast_buffer(1,j+off),nbw,nl,stripe_width)
! call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
! bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else #else
! call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), & call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), &
! bcast_buffer(1,j+off),nbw,nl,stripe_width) bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif #endif
enddo enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
......
...@@ -708,14 +708,12 @@ module compute_hh_trafo_real ...@@ -708,14 +708,12 @@ module compute_hh_trafo_real
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
print *,"at the moment no single-precision sse kernel"
stop
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
! call double_hh_trafo_single(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, & call double_hh_trafo_single(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
! stripe_width, nbw) stripe_width, nbw)
#else #else
! call double_hh_trafo_single(a(1,j+off+a_off-1,istripe), w, nbw, nl, & call double_hh_trafo_single(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
! stripe_width, nbw) stripe_width, nbw)
#endif #endif
enddo enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......
...@@ -94,7 +94,7 @@ module mod_read_input_parameters ...@@ -94,7 +94,7 @@ module mod_read_input_parameters
read(arg3, *) nblk read(arg3, *) nblk
if (arg4 .eq. "output_eigenvalues") then if (arg4 .eq. "output_eigenvalues") then
write_to_file%eigenvectors = .true. write_to_file%eigenvalues = .true.
else else
write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted" write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted"
stop 1 stop 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment