Commit de6a4fde authored by Andreas Marek's avatar Andreas Marek

Enable single-precision calculations for ELPA1

With the configure option "--enable-single-precision" ELPA1 is build
with single-precision (half-words) only.

The best precision in single-precision (float or complex) is
2^-23 ~ 1.2e-7. The accuracy of the error residual of ELPA1 in
single-precision mode is of the order 1e-4 to 1e-5. The orthogonality of
the EV's is fullfilled up to about ~1e-6.

Thus the precision of ELPA1 in single-precision mode is roughly 100 -
1000 times less than the best achievable precison. This is consistent
with the double-precision mode, where also a factor of 100 - 1000 less
precision than the theoretical best one is found.

The float EVs are identical to the double EVs to at least 1e-2, the
precision of the EVs is thus about 1e-7/1e-2 = 1e5 times lower than the
best theoretical precision. If the same holds for the double precision
calculations, this implies that the double precision results can also
be only trusted on the level 1e-11 (5 orders of magnitude larger
than the best theoretical precision)

The best speed-up compared to the double precision calculation is
a factor of two. This is by far not achieved yet, since the singl
precision version is not at all optimized at the moment
parent c6b56c7e
...@@ -9,7 +9,7 @@ AM_LDFLAGS = $(SCALAPACK_LDFLAGS) ...@@ -9,7 +9,7 @@ AM_LDFLAGS = $(SCALAPACK_LDFLAGS)
lib_LTLIBRARIES = libelpa@SUFFIX@.la lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/elpa_utilities.F90 \ src/elpa_utilities.F90 \
src/elpa1_compute.F90 \ src/elpa1_compute.F90 \
src/elpa1.F90 \ src/elpa1.F90 \
...@@ -310,6 +310,8 @@ elpa2_test_complex_default_kernel.sh: ...@@ -310,6 +310,8 @@ elpa2_test_complex_default_kernel.sh:
elpa2_test_complex_choose_kernel_with_api.sh: elpa2_test_complex_choose_kernel_with_api.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
chmod +x elpa2_test_complex_choose_kernel_with_api.sh chmod +x elpa2_test_complex_choose_kernel_with_api.sh
mod_precision.i: $(top_srcdir)/src/mod_precision.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_precision.F90 -o $@
elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@
...@@ -320,6 +322,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90 ...@@ -320,6 +322,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90 elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
elpa1_compute.i: $(top_srcdir)/src/elpa1_compute.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1_compute.F90 -o $@
elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@
......
...@@ -509,6 +509,14 @@ if test x"${want_gpu}" = x"yes" ; then ...@@ -509,6 +509,14 @@ if test x"${want_gpu}" = x"yes" ; then
can_compile_gpu=yes can_compile_gpu=yes
fi fi
dnl check whether single precision is requested
AC_MSG_CHECKING(whether single precision calculations are requested)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
dnl now check which kernels can be compiled dnl now check which kernels can be compiled
dnl the checks for SSE were already done before dnl the checks for SSE were already done before
...@@ -722,10 +730,15 @@ DX_HTML_FEATURE(ON) ...@@ -722,10 +730,15 @@ DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs]) DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0 DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable]) AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi fi
if test x"${want_single_precision}" = x"no" ; then
AC_DEFINE([DOUBLE_PRECISION_REAL],[1],[use double precision for real calculation])
AC_DEFINE([DOUBLE_PRECISION_COMPLEX],[1],[use double precision for complex calculation])
fi
AC_SUBST([WITH_MKL]) AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS]) AC_SUBST([WITH_BLACS])
AC_SUBST([with_amd_bulldozer_kernel]) AC_SUBST([with_amd_bulldozer_kernel])
......
...@@ -88,6 +88,7 @@ module ELPA1 ...@@ -88,6 +88,7 @@ module ELPA1
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#endif #endif
use iso_c_binding
implicit none implicit none
PRIVATE ! By default, all routines contained are private PRIVATE ! By default, all routines contained are private
...@@ -104,9 +105,9 @@ module ELPA1 ...@@ -104,9 +105,9 @@ module ELPA1
! Timing results, set by every call to solve_evp_xxx ! Timing results, set by every call to solve_evp_xxx
real(kind=rk), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) real(kind=c_double), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form)
real(kind=rk), public :: time_evp_solve !< time for solving the tridiagonal system real(kind=c_double), public :: time_evp_solve !< time for solving the tridiagonal system
real(kind=rk), public :: time_evp_back !< time for back transformations of eigenvectors real(kind=c_double), public :: time_evp_back !< time for back transformations of eigenvectors
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
...@@ -294,6 +295,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp ...@@ -294,6 +295,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#endif #endif
use iso_c_binding
implicit none implicit none
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
...@@ -303,7 +305,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp ...@@ -303,7 +305,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: my_prow, my_pcol, mpierr integer(kind=ik) :: my_prow, my_pcol, mpierr
real(kind=rk), allocatable :: e(:), tau(:) real(kind=rk), allocatable :: e(:), tau(:)
real(kind=rk) :: ttt0, ttt1 real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success logical :: success
logical, save :: firstCall = .true. logical, save :: firstCall = .true.
logical :: wantDebug logical :: wantDebug
...@@ -395,6 +397,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, ...@@ -395,6 +397,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
use timings use timings
#endif #endif
use precision use precision
use iso_c_binding
implicit none implicit none
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
...@@ -407,7 +410,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, ...@@ -407,7 +410,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
integer(kind=ik) :: l_rows, l_cols, l_cols_nev integer(kind=ik) :: l_rows, l_cols, l_cols_nev
real(kind=rk), allocatable :: q_real(:,:), e(:) real(kind=rk), allocatable :: q_real(:,:), e(:)
complex(kind=ck), allocatable :: tau(:) complex(kind=ck), allocatable :: tau(:)
real(kind=rk) :: ttt0, ttt1 real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success logical :: success
logical, save :: firstCall = .true. logical, save :: firstCall = .true.
......
...@@ -89,6 +89,7 @@ module ELPA1_compute ...@@ -89,6 +89,7 @@ module ELPA1_compute
include 'mpif.h' include 'mpif.h'
contains contains
#ifdef DOUBLE_PRECISION_REAL
#define DATATYPE REAL(kind=rk) #define DATATYPE REAL(kind=rk)
#define BYTESIZE 8 #define BYTESIZE 8
...@@ -99,6 +100,19 @@ module ELPA1_compute ...@@ -99,6 +100,19 @@ module ELPA1_compute
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#else
#define DATATYPE REAL(kind=rk)
#define BYTESIZE 4
#define REALCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#endif /* DOUBLE_PRECISION_REAL */
subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!------------------------------------------------------------------------------- !-------------------------------------------------------------------------------
...@@ -239,8 +253,14 @@ module ELPA1_compute ...@@ -239,8 +253,14 @@ module ELPA1_compute
vr(1:l_rows) = a(1:l_rows,l_cols+1) vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then if(nstor>0 .and. l_rows>0) then
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & #ifdef DOUBLE_PRECISION_REAL
uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) call DGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1)
#else
call SGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1)
#endif
endif endif
if(my_prow==prow(istep-1, nblk, np_rows)) then if(my_prow==prow(istep-1, nblk, np_rows)) then
...@@ -251,8 +271,11 @@ module ELPA1_compute ...@@ -251,8 +271,11 @@ module ELPA1_compute
aux1(2) = 0. aux1(2) = 0.
endif endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #if DOUBLE_PRECISION_REAL
call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
vnorm2 = aux2(1) vnorm2 = aux2(1)
vrl = aux2(2) vrl = aux2(2)
...@@ -274,7 +297,11 @@ module ELPA1_compute ...@@ -274,7 +297,11 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns ! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) #ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
tau(istep) = vr(l_rows+1) tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc ! Transpose Householder vector vr -> vc
...@@ -319,15 +346,33 @@ module ELPA1_compute ...@@ -319,15 +346,33 @@ module ELPA1_compute
if (lre<lrs) cycle if (lre<lrs) cycle
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then if (mod(n_iter,n_threads) == my_thread) then
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1) #ifdef DOUBLE_PRECISION_REAL
if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1) call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc_p(lcs,my_thread), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur_p(lrs,my_thread), 1)
endif
#else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc_p(lcs,my_thread), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur_p(lrs,my_thread), 1)
endif
#endif
endif endif
n_iter = n_iter+1 n_iter = n_iter+1
#else #else /* WITH_OPENMP */
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc(lcs),1)
if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur(lrs),1)
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc(lcs), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur(lrs), 1)
endif
#else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc(lcs), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur(lrs), 1)
endif
#endif #endif
#endif /* WITH_OPENMP */
enddo enddo
enddo enddo
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -342,8 +387,13 @@ module ELPA1_compute ...@@ -342,8 +387,13 @@ module ELPA1_compute
enddo enddo
#endif #endif
if (nstor>0) then if (nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) #ifdef DOUBLE_PRECISION_REAL
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) call DGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1)
call DGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1)
#else
call SGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1)
call SGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1)
#endif
endif endif
endif endif
...@@ -363,7 +413,12 @@ module ELPA1_compute ...@@ -363,7 +413,12 @@ module ELPA1_compute
if (l_cols>0) then if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols) tmp(1:l_cols) = uc(1:l_cols)
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
endif endif
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
...@@ -374,8 +429,11 @@ module ELPA1_compute ...@@ -374,8 +429,11 @@ module ELPA1_compute
x = 0 x = 0
if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) #ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
! store u and v in the matrices U and V ! store u and v in the matrices U and V
! these matrices are stored combined in one here ! these matrices are stored combined in one here
...@@ -400,9 +458,16 @@ module ELPA1_compute ...@@ -400,9 +458,16 @@ module ELPA1_compute
lrs = 1 lrs = 1
lre = min(l_rows,(i+1)*l_rows_tile) lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle if (lce<lcs .or. lre<lrs) cycle
call dgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, & #ifdef DOUBLE_PRECISION_REAL
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), & call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk, &
1.d0,a(lrs,lcs),lda) vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk, a(lrs,lcs), lda)
#else
call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk, a(lrs,lcs), lda)
#endif
enddo enddo
nstor = 0 nstor = 0
...@@ -427,14 +492,25 @@ module ELPA1_compute ...@@ -427,14 +492,25 @@ module ELPA1_compute
! distribute the arrays d and e to all processors ! distribute the arrays d and e to all processors
allocate(tmp(na)) allocate(tmp(na))
#ifdef DOUBLE_PRECISION_REAL
tmp = d
call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e
call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e
call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
tmp = d tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d
call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
deallocate(tmp) deallocate(tmp)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_real") call timer%stop("tridiag_real")
...@@ -560,8 +636,11 @@ module ELPA1_compute ...@@ -560,8 +636,11 @@ module ELPA1_compute
enddo enddo
if (nb>0) & if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) #ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr)
#endif
nb = 0 nb = 0
do ic=ics,ice do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
...@@ -578,7 +657,12 @@ module ELPA1_compute ...@@ -578,7 +657,12 @@ module ELPA1_compute
tmat = 0 tmat = 0
if (l_rows>0) & if (l_rows>0) &
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) #ifdef DOUBLE_PRECISION_REAL
call dsyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows)
#else
call ssyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows)
#endif
nc = 0 nc = 0
do n=1,nstor-1 do n=1,nstor-1
...@@ -586,14 +670,22 @@ module ELPA1_compute ...@@ -586,14 +670,22 @@ module ELPA1_compute
nc = nc+n nc = nc+n
enddo enddo
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #ifdef DOUBLE_PRECISION_REAL
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
! Calculate triangular matrix T ! Calculate triangular matrix T
nc = 0 nc = 0
tmat(1,1) = tau(ice-nstor+1) tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1 do n=1,nstor-1
call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1) #ifdef DOUBLE_PRECISION_REAL
call dtrmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
#else
call strmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
#endif
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n nc = nc+n
...@@ -602,17 +694,32 @@ module ELPA1_compute ...@@ -602,17 +694,32 @@ module ELPA1_compute
! Q = Q - V * T * V**T * Q ! Q = Q - V * T * V**T * Q
if (l_rows>0) then if (l_rows>0) then
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & #ifdef DOUBLE_PRECISION_REAL
q,ldq,0.d0,tmp1,nstor) call dgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), &
q, ldq, 0.0_rk, tmp1, nstor)
#else
call sgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), &
q, ldq, 0.0_rk, tmp1, nstor)
#endif
else else
tmp1(1:l_cols*nstor) = 0 tmp1(1:l_cols*nstor) = 0
endif endif
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
if (l_rows>0) then if (l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2,nstor,1.d0,q,ldq) tmp2, nstor, 1.0_rk, q, ldq)
endif endif
#else
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
if (l_rows>0) then
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk, q, ldq)
endif
#endif
nstor = 0 nstor = 0
endif endif
...@@ -809,8 +916,11 @@ module ELPA1_compute ...@@ -809,8 +916,11 @@ module ELPA1_compute
enddo enddo
! Broadcast block column ! Broadcast block column
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr) call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr)
#endif
! Insert what we got in aux_mat ! Insert what we got in aux_mat
...@@ -844,15 +954,24 @@ module ELPA1_compute ...@@ -844,15 +954,24 @@ module ELPA1_compute
if (lcs<=lce) then if (lcs<=lce) then
allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce))
if (lrs<=lre) then if (lrs<=lre) then
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), & #ifdef DOUBLE_PRECISION_REAL
b(lrs,lcs),ldb,0.d0,tmp1,nstor) call dgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), &
b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor)
#else
call sgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), &
b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor)
#endif
else else
tmp1 = 0 tmp1 = 0
endif endif