Optional build of ELPA without MPI

The configure flag "--enable-shared-memory-only" triggers a build
of ELPA without MPI support:

- all MPI calls are skipped (or overloaded by stub routines; see the sketch
  below)
- all calls to ScaLAPACK functions are replaced by the corresponding
  LAPACK calls
- all calls to BLACS are skipped
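
For illustration, the stub layer added in src/mod_mpi_stubs.F90 could look
roughly like the sketch below. The module name, interfaces and bodies are
assumptions for illustration only; the actual contents of the file are not
shown in this excerpt.

    ! Hypothetical sketch of an MPI stub layer for a shared-memory-only build.
    ! The real stubs live in src/mod_mpi_stubs.F90; names and interfaces here
    ! are illustrative assumptions.
    module mpi_stubs_sketch
      implicit none
      integer, parameter :: MPI_COMM_WORLD = 0
    contains
      subroutine mpi_comm_rank(comm, rank, mpierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: rank, mpierr
        rank   = 0    ! the single task always has rank 0
        mpierr = 0
      end subroutine mpi_comm_rank

      subroutine mpi_comm_size(comm, ntasks, mpierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: ntasks, mpierr
        ntasks = 1    ! a shared-memory run behaves like one MPI task
        mpierr = 0
      end subroutine mpi_comm_size
    end module mpi_stubs_sketch

With only one task, collective operations degenerate into copies; this is what
the #ifdef WITH_MPI / #else branches in the diff below do explicitly, e.g.
replacing an mpi_allreduce by "aux2 = aux1".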

Using ELPA without MPI gives the same results as using ELPA with 1 MPI
task!

This version is not yet optimized for performance; here and there some
unnecessary copies are made.

This version is intended for users who do not have MPI in their
application but would still like to use ELPA on a single compute node.
parent 29d84527
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
 libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
 libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
+                             src/mod_mpi.F90 \
+                             src/mod_mpi_stubs.F90 \
                              src/elpa_utilities.F90 \
                              src/elpa1_compute.F90 \
                              src/elpa1.F90 \
@@ -22,9 +24,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
                              src/elpa2_compute.F90 \
                              src/elpa2.F90 \
                              src/elpa_c_interface.F90 \
-                             src/elpa_qr/qr_utils.f90 \
+                             src/elpa_qr/qr_utils.F90 \
                              src/elpa_qr/elpa_qrkernels.f90 \
-                             src/elpa_qr/elpa_pdlarfb.f90 \
+                             src/elpa_qr/elpa_pdlarfb.F90 \
                              src/elpa_qr/elpa_pdgeqrf.F90
 if HAVE_DETAILED_TIMINGS
   libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
@@ -38,6 +40,13 @@ if HAVE_DETAILED_TIMINGS
                                 src/ftimings/papi.c
 endif
+if !WITH_MPI
+  libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
+if !HAVE_DETAILED_TIMINGS
+  libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
+endif
+endif
 if WITH_REAL_GENERIC_KERNEL
   libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
 endif
--- a/configure.ac
+++ b/configure.ac
@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
   AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
 fi
+AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
+AC_ARG_ENABLE([shared-memory-only],
+              AS_HELP_STRING([--enable-shared-memory-only],
+                             [do not use MPI; ELPA will be build for one node shared-memory runs only]),
+              [],
+              [enable_shared_memory_only=no])
+AC_MSG_RESULT([${enable_shared_memory_only}])
+AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
+if test x"${enable_shared_memory_only}" = x"no"; then
+  AC_DEFINE([WITH_MPI], [1], [use MPI])
+fi
 dnl check whether mpi compilers are available;
 dnl if not abort since it is mandatory
 # C
 AC_LANG([C])
-m4_include([m4/ax_prog_cc_mpi.m4])
-AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
+AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
 if test x"${enable_openmp}" = x"yes"; then
   AX_ELPA_OPENMP
   if test "$ac_cv_prog_cc_openmp" = unsupported; then
     AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
   fi
   CFLAGS="$OPENMP_CFLAGS $CFLAGS"
 fi
 AC_PROG_INSTALL
 AM_PROG_AR
 AM_PROG_AS
 # Fortran
 AC_LANG([Fortran])
 m4_include([m4/ax_prog_fc_mpi.m4])
-AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
+AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
 if test x"${enable_openmp}" = x"yes"; then
   AX_ELPA_OPENMP
   if test "$ac_cv_prog_fc_openmp" = unsupported; then
     AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
   fi
   FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
 fi
 # C++
@@ -106,11 +117,11 @@ AC_LANG([C++])
 AC_PROG_CXX
 if test x"${enable_openmp}" = x"yes"; then
   AX_ELPA_OPENMP
   if test "$ac_cv_prog_cxx_openmp" = unsupported; then
     AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
   fi
   CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
 fi
@@ -386,35 +397,37 @@ else
   AC_MSG_ERROR([could not link with lapack: specify path])
 fi
+if test x"${enable_shared_memory_only}" = x"no"; then
   dnl test whether scalapack already contains blacs
   scalapack_libs="mpiscalapack scalapack"
   old_LIBS="$LIBS"
   for lib in ${scalapack_libs}; do
     LIBS="-l${lib} ${old_LIBS}"
     AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
     AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
     AC_MSG_RESULT([${blacs_in_scalapack}])
     if test x"${blacs_in_scalapack}" = x"yes"; then
       break
     fi
   done
   if test x"${blacs_in_scalapack}" = x"no"; then
     LIBS="${old_LIBS}"
     dnl Test for stand-alone blacs
     AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
     AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
     if test x"${have_blacs}" = x"no"; then
       AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
     fi
   fi
   AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
   if test x"${have_scalapack}" = x"no" ; then
     AC_MSG_ERROR([could not link with scalapack: specify path])
   fi
+fi
 dnl check whether we can link alltogehter
@@ -655,7 +668,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
 fi
 if test x"${use_specific_real_kernel}" = x"no" ; then
   AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
 fi
 LT_INIT
@@ -667,7 +680,7 @@ DX_HTML_FEATURE(ON)
 DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
 DESPERATELY_WANT_ASSUMED_SIZE=0
-if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
+if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
   AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
 fi
--- a/src/elpa1.F90
+++ b/src/elpa1.F90
@@ -86,8 +86,10 @@ module ELPA1
   use elpa1_compute
 #ifdef HAVE_DETAILED_TIMINGS
   use timings
 #endif
+  use elpa_mpi
   implicit none
   PRIVATE ! By default, all routines contained are private
@@ -110,7 +112,6 @@ module ELPA1
   logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
-  include 'mpif.h'
   !> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
   !> \detail
@@ -328,6 +329,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
       ttt0 = MPI_Wtime()
       call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
       ttt1 = MPI_Wtime()
       if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
       time_evp_fwd = ttt1-ttt0
--- a/src/elpa1_compute.F90
+++ b/src/elpa1_compute.F90
@@ -57,6 +57,7 @@ module ELPA1_compute
 #ifdef HAVE_DETAILED_TIMINGS
   use timings
 #endif
+  use elpa_mpi
   implicit none
   PRIVATE ! set default to private
@@ -86,8 +87,6 @@ module ELPA1_compute
   public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
   public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
-  include 'mpif.h'
   contains
 #define DATATYPE REAL(kind=rk)
@@ -174,7 +173,6 @@ module ELPA1_compute
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
       tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
@@ -296,7 +294,11 @@ module ELPA1_compute
          aux1(2) = 0.
        endif
+#ifdef WITH_MPI
        call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+       aux2 = aux1
+#endif
        vnorm2 = aux2(1)
        vrl = aux2(2)
@@ -319,7 +321,9 @@ module ELPA1_compute
        ! Broadcast the Householder vector (and tau) along columns
        if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
+#ifdef WITH_MPI
        call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
+#endif
        tau(istep) = vr(l_rows+1)
        ! Transpose Householder vector vr -> vc
@@ -408,7 +412,11 @@ module ELPA1_compute
        if (l_cols>0) then
          tmp(1:l_cols) = uc(1:l_cols)
+#ifdef WITH_MPI
          call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+         uc = tmp
+#endif
        endif
        call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
@@ -419,8 +427,11 @@ module ELPA1_compute
        x = 0
        if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
+#ifdef WITH_MPI
        call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
+#else
+       vav = x
+#endif
        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here
@@ -481,7 +492,7 @@ module ELPA1_compute
        print *,"tridiag_real: error when allocating tmp "//errorMessage
        stop
      endif
+#ifdef WITH_MPI
      tmp = d
      call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      tmp = d
@@ -490,6 +501,7 @@ module ELPA1_compute
      call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      tmp = e
      call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
+#endif
      deallocate(tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_real: error when deallocating tmp "//errorMessage
@@ -570,7 +582,6 @@ module ELPA1_compute
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      totalblocks = (na-1)/nblk + 1
      max_blocks_row = (totalblocks-1)/np_rows + 1
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
@@ -654,9 +665,10 @@ module ELPA1_compute
        nb = nb+l_rows
      enddo
+#ifdef WITH_MPI
      if (nb>0) &
        call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
+#endif
      nb = 0
      do ic=ics,ice
        l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
@@ -680,9 +692,11 @@ module ELPA1_compute
          h1(nc+1:nc+n) = tmat(1:n,n+1)
          nc = nc+n
        enddo
+#ifdef WITH_MPI
        if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+       if (nc>0) h2 = h1
+#endif
        ! Calculate triangular matrix T
        nc = 0
@@ -702,7 +716,11 @@ module ELPA1_compute
        else
          tmp1(1:l_cols*nstor) = 0
        endif
+#ifdef WITH_MPI
        call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+       tmp2 = tmp1
+#endif
        if (l_rows>0) then
          call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
          call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
@@ -800,6 +818,7 @@ module ELPA1_compute
 #ifdef HAVE_DETAILED_TIMINGS
      call timer%start("mult_at_b_real")
 #endif
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -900,9 +919,9 @@ module ELPA1_compute
      enddo
      ! Broadcast block column
+#ifdef WITH_MPI
      call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr)
+#endif
      ! Insert what we got in aux_mat
      n_aux_bc = 0
@@ -947,8 +966,11 @@ module ELPA1_compute
      endif
      ! Sum up the results and send to processor row np
+#ifdef WITH_MPI
      call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr)
+#else
+     tmp2 = tmp1
+#endif
      ! Put the result into C
      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
@@ -1189,9 +1211,11 @@ module ELPA1_compute
        aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
        aux1(2) = 0.
      endif
+#ifdef WITH_MPI
      call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+     aux2 = aux1
+#endif
      vnorm2 = aux2(1)
      vrl = aux2(2)
@@ -1213,7 +1237,9 @@ module ELPA1_compute
      ! Broadcast the Householder vector (and tau) along columns
      if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
+#ifdef WITH_MPI
      call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
+#endif
      tau(istep) = vr(l_rows+1)
      ! Transpose Householder vector vr -> vc
@@ -1306,7 +1332,11 @@ module ELPA1_compute
      if (l_cols>0) then
        tmp(1:l_cols) = uc(1:l_cols)
+#ifdef WITH_MPI
        call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
+#else
+       uc = tmp
+#endif
      endif
      ! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
@@ -1323,8 +1353,11 @@ module ELPA1_compute
      xc = 0
      if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols))
+#ifdef WITH_MPI
      call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr)
+#else
+     vav = xc
+#endif
      ! store u and v in the matrices U and V
      ! these matrices are stored combined in one here
@@ -1376,9 +1409,13 @@ module ELPA1_compute
        e(1) = vrl
        a(1,l_cols) = 1. ! for consistency only
      endif
+#ifdef WITH_MPI
      call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr)
+#endif
      endif
+#ifdef WITH_MPI