Commit de6a4fde authored by Andreas Marek's avatar Andreas Marek

Enable single-precision calculations for ELPA1

With the configure option "--enable-single-precision" ELPA1 is build
with single-precision (half-words) only.

The best precision in single-precision (float or complex) is
2^-23 ~ 1.2e-7. The accuracy of the error residual of ELPA1 in
single-precision mode is of the order 1e-4 to 1e-5. The orthogonality of
the EV's is fullfilled up to about ~1e-6.

Thus the precision of ELPA1 in single-precision mode is roughly 100 -
1000 times less than the best achievable precison. This is consistent
with the double-precision mode, where also a factor of 100 - 1000 less
precision than the theoretical best one is found.

The float EVs are identical to the double EVs to at least 1e-2, the
precision of the EVs is thus about 1e-7/1e-2 = 1e5 times lower than the
best theoretical precision. If the same holds for the double precision
calculations, this implies that the double precision results can also
be only trusted on the level 1e-11 (5 orders of magnitude larger
than the best theoretical precision)

The best speed-up compared to the double precision calculation is
a factor of two. This is by far not achieved yet, since the singl
precision version is not at all optimized at the moment
parent c6b56c7e
......@@ -9,7 +9,7 @@ AM_LDFLAGS = $(SCALAPACK_LDFLAGS)
lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -310,6 +310,8 @@ elpa2_test_complex_default_kernel.sh:
elpa2_test_complex_choose_kernel_with_api.sh:
echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
chmod +x elpa2_test_complex_choose_kernel_with_api.sh
mod_precision.i: $(top_srcdir)/src/mod_precision.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_precision.F90 -o $@
elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@
......@@ -320,6 +322,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
elpa1_compute.i: $(top_srcdir)/src/elpa1_compute.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1_compute.F90 -o $@
elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@
......
......@@ -509,6 +509,14 @@ if test x"${want_gpu}" = x"yes" ; then
can_compile_gpu=yes
fi
dnl check whether single precision is requested
AC_MSG_CHECKING(whether single precision calculations are requested)
AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
[build with single precision])],
want_single_precision="yes", want_single_precision="no")
AC_MSG_RESULT([${want_single_precision}])
dnl now check which kernels can be compiled
dnl the checks for SSE were already done before
......@@ -722,10 +730,15 @@ DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
if test x"${want_single_precision}" = x"no" ; then
AC_DEFINE([DOUBLE_PRECISION_REAL],[1],[use double precision for real calculation])
AC_DEFINE([DOUBLE_PRECISION_COMPLEX],[1],[use double precision for complex calculation])
fi
AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS])
AC_SUBST([with_amd_bulldozer_kernel])
......
......@@ -88,6 +88,7 @@ module ELPA1
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use iso_c_binding
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -104,9 +105,9 @@ module ELPA1
! Timing results, set by every call to solve_evp_xxx
real(kind=rk), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form)
real(kind=rk), public :: time_evp_solve !< time for solving the tridiagonal system
real(kind=rk), public :: time_evp_back !< time for back transformations of eigenvectors
real(kind=c_double), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form)
real(kind=c_double), public :: time_evp_solve !< time for solving the tridiagonal system
real(kind=c_double), public :: time_evp_back !< time for back transformations of eigenvectors
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
......@@ -294,6 +295,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use iso_c_binding
implicit none
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
......@@ -303,7 +305,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: my_prow, my_pcol, mpierr
real(kind=rk), allocatable :: e(:), tau(:)
real(kind=rk) :: ttt0, ttt1
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success
logical, save :: firstCall = .true.
logical :: wantDebug
......@@ -395,6 +397,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
use timings
#endif
use precision
use iso_c_binding
implicit none
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
......@@ -407,7 +410,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
integer(kind=ik) :: l_rows, l_cols, l_cols_nev
real(kind=rk), allocatable :: q_real(:,:), e(:)
complex(kind=ck), allocatable :: tau(:)
real(kind=rk) :: ttt0, ttt1
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success
logical, save :: firstCall = .true.
......
......@@ -89,6 +89,7 @@ module ELPA1_compute
include 'mpif.h'
contains
#ifdef DOUBLE_PRECISION_REAL
#define DATATYPE REAL(kind=rk)
#define BYTESIZE 8
......@@ -99,6 +100,19 @@ module ELPA1_compute
#undef BYTESIZE
#undef REALCASE
#else
#define DATATYPE REAL(kind=rk)
#define BYTESIZE 4
#define REALCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#endif /* DOUBLE_PRECISION_REAL */
subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
......@@ -239,8 +253,14 @@ module ELPA1_compute
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), &
uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1)
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1)
#else
call SGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1)
#endif
endif
if(my_prow==prow(istep-1, nblk, np_rows)) then
......@@ -251,8 +271,11 @@ module ELPA1_compute
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#if DOUBLE_PRECISION_REAL
call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -274,7 +297,11 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -319,15 +346,33 @@ module ELPA1_compute
if (lre<lrs) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1)
if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1)
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc_p(lcs,my_thread), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur_p(lrs,my_thread), 1)
endif
#else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc_p(lcs,my_thread), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur_p(lrs,my_thread), 1)
endif
#endif
endif
n_iter = n_iter+1
#else
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc(lcs),1)
if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur(lrs),1)
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc(lcs), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur(lrs), 1)
endif
#else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk, uc(lcs), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk, ur(lrs), 1)
endif
#endif
#endif /* WITH_OPENMP */
enddo
enddo
#ifdef WITH_OPENMP
......@@ -342,8 +387,13 @@ module ELPA1_compute
enddo
#endif
if (nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1)
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1)
call DGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1)
#else
call SGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1)
call SGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1)
#endif
endif
endif
......@@ -363,7 +413,12 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
endif
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
......@@ -374,8 +429,11 @@ module ELPA1_compute
x = 0
if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
! store u and v in the matrices U and V
! these matrices are stored combined in one here
......@@ -400,9 +458,16 @@ module ELPA1_compute
lrs = 1
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle
call dgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, &
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), &
1.d0,a(lrs,lcs),lda)
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk, a(lrs,lcs), lda)
#else
call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk, a(lrs,lcs), lda)
#endif
enddo
nstor = 0
......@@ -427,14 +492,25 @@ module ELPA1_compute
! distribute the arrays d and e to all processors
allocate(tmp(na))
#ifdef DOUBLE_PRECISION_REAL
tmp = d
call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e
call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e
call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d
call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
deallocate(tmp)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_real")
......@@ -560,8 +636,11 @@ module ELPA1_compute
enddo
if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr)
#endif
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
......@@ -578,7 +657,12 @@ module ELPA1_compute
tmat = 0
if (l_rows>0) &
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows)
#ifdef DOUBLE_PRECISION_REAL
call dsyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows)
#else
call ssyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows)
#endif
nc = 0
do n=1,nstor-1
......@@ -586,14 +670,22 @@ module ELPA1_compute
nc = nc+n
enddo
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
! Calculate triangular matrix T
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1
call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1)
#ifdef DOUBLE_PRECISION_REAL
call dtrmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
#else
call strmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
#endif
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
......@@ -602,17 +694,32 @@ module ELPA1_compute
! Q = Q - V * T * V**T * Q
if (l_rows>0) then
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), &
q,ldq,0.d0,tmp1,nstor)
#ifdef DOUBLE_PRECISION_REAL
call dgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), &
q, ldq, 0.0_rk, tmp1, nstor)
#else
call sgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), &
q, ldq, 0.0_rk, tmp1, nstor)
#endif
else
tmp1(1:l_cols*nstor) = 0
endif
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
if (l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
tmp2,nstor,1.d0,q,ldq)
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk, q, ldq)
endif
#else
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
if (l_rows>0) then
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk, q, ldq)
endif
#endif
nstor = 0
endif
......@@ -809,8 +916,11 @@ module ELPA1_compute
enddo
! Broadcast block column
call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr)
#endif
! Insert what we got in aux_mat
......@@ -844,15 +954,24 @@ module ELPA1_compute
if (lcs<=lce) then
allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce))
if (lrs<=lre) then
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), &
b(lrs,lcs),ldb,0.d0,tmp1,nstor)
#ifdef DOUBLE_PRECISION_REAL
call dgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), &
b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor)
#else
call sgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), &
b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor)
#endif
else
tmp1 = 0
endif
! Sum up the results and send to processor row np
call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL8, MPI_SUM, np, mpi_comm_rows, mpierr)
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
......@@ -873,6 +992,8 @@ module ELPA1_compute
end subroutine mult_at_b_real
#ifdef DOUBLE_PRECISION_COMPLEX
#define DATATYPE COMPLEX(kind=ck)
#define BYTESIZE 16
#define COMPLEXCASE 1
......@@ -882,6 +1003,18 @@ module ELPA1_compute
#undef BYTESIZE
#undef COMPLEXCASE
#else /* DOUBLE_PRECISION_COMPLEX */
#define DATATYPE COMPLEX(kind=ck)
#define BYTESIZE 8
#define COMPLEXCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#endif /* DOUBLE_PRECISION_COMPLEX */
subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
......@@ -930,7 +1063,7 @@ module ELPA1_compute
integer(kind=ik), parameter :: max_stored_rows = 32
complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
complex(kind=ck), parameter :: CZERO = (0.0_rk,0.0_rk), CONE = (1.0_rk,0.0_rk)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
......@@ -1028,8 +1161,13 @@ module ELPA1_compute
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if (nstor>0 .and. l_rows>0) then
aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor))
call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), &
aux,1,CONE,vr,1)
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), &
aux, 1, CONE, vr, 1)
#else
call CGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), &
aux, 1, CONE, vr, 1)
#endif
endif
if (my_prow==prow(istep-1, nblk, np_rows)) then
......@@ -1039,9 +1177,11 @@ module ELPA1_compute
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -1063,7 +1203,11 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(vr, l_rows+1, MPI_DOUBLE_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, l_rows+1, MPI_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -1111,14 +1255,34 @@ module ELPA1_compute
if (lre<lrs) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc_p(lcs,my_thread),1)
if (i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur_p(lrs,my_thread),1)
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('C', lre-lrs+1 ,lce-lcs+1, CONE, a(lrs,lcs), lda, vr(lrs), 1, CONE, uc_p(lcs,my_thread), 1)
if (i/=j) then
call ZGEMV('N', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vc(lcs), 1, CONE, ur_p(lrs,my_thread), 1)
endif
#else
call CGEMV('C', lre-lrs+1 ,lce-lcs+1, CONE, a(lrs,lcs), lda, vr(lrs), 1, CONE, uc_p(lcs,my_thread), 1)
if (i/=j) then
call CGEMV('N', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vc(lcs), 1, CONE, ur_p(lrs,my_thread), 1)
endif
#endif
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('C', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vr(lrs), 1, CONE, uc(lcs), 1)
if (i/=j) then
call ZGEMV('N', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vc(lcs), 1, CONE, ur(lrs), 1)
endif
#else
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc(lcs),1)
if (i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur(lrs),1)
call CGEMV('C', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vr(lrs), 1, CONE, uc(lcs), 1)
if (i/=j) then
call CGEMV('N', lre-lrs+1, lce-lcs+1, CONE, a(lrs,lcs), lda, vc(lcs), 1, CONE, ur(lrs), 1)
endif
#endif
#endif /* WITH_OPENMP */
enddo
enddo
......@@ -1135,8 +1299,13 @@ module ELPA1_compute
#endif
if (nstor>0) then
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1)
call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1)
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1)
call ZGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1)
#else
call CGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1)
call CGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1)
#endif
endif
endif
......@@ -1156,7 +1325,12 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(tmp, uc, l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp, uc, l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
endif
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
......@@ -1173,7 +1347,11 @@ module ELPA1_compute
xc