Unverified Commit 49f119aa authored by Andreas Marek

Optional build of ELPA without MPI

The configure flag "--enable-shared-memory-only" triggers a build
of ELPA without MPI support:

- all MPI calls are skipped (or overloaded)
- all calls to scalapack functions are replaced by the corresponding
  lapack calls
- all calls to blacs are skipped

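For example, collectives are guarded by the new WITH_MPI preprocessor symbol; in the shared-memory build the "reduction" over a single task degenerates to a plain copy, as in this pattern taken from the changes to elpa1_compute.F90 below:

    #ifdef WITH_MPI
      call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
    #else
      ! only one task: the reduction result is just the local contribution
      aux2 = aux1
    #endif
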
Using ELPA without MPI gives the same results as using ELPA with 1 MPI
task!
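
Since routines such as mpi_comm_rank and mpi_comm_size are still called unconditionally, the non-MPI build links against stub replacements that behave like a single task (rank 0, size 1). A minimal sketch of what such a stub could look like, using hypothetical names (the actual contents of src/mod_mpi_stubs.F90 may differ):

    module mpi_stubs_sketch                      ! hypothetical name, for illustration only
      implicit none
      integer, parameter :: MPI_COMM_WORLD = 0   ! dummy communicator handle
    contains
      subroutine mpi_comm_rank(comm, rank, ierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: rank, ierr
        rank = 0                                 ! always "task 0" in a shared-memory run
        ierr = 0
      end subroutine mpi_comm_rank
      subroutine mpi_comm_size(comm, ntasks, ierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: ntasks, ierr
        ntasks = 1                               ! exactly one task
        ierr = 0
      end subroutine mpi_comm_size
    end module mpi_stubs_sketch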

This version is not yet optimized for performance; here and there some
unnecessary copies are made.

This version is intended for users who do not have MPI in their
application but would still like to use ELPA on a single compute node
parent 29d84527
......@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -22,9 +24,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdlarfb.F90 \
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
......@@ -38,6 +40,13 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if !WITH_MPI
libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
......
......@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi
AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
AS_HELP_STRING([--enable-shared-memory-only],
[do not use MPI; ELPA will be built for one-node shared-memory runs only]),
[],
[enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl check whether mpi compilers are available;
dnl if not abort since it is mandatory
# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS
# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
......@@ -106,11 +117,11 @@ AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
......@@ -386,35 +397,37 @@ else
AC_MSG_ERROR([could not link with lapack: specify path])
fi
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${enable_shared_memory_only}" = x"no"; then
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
fi
fi
fi
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
dnl check whether we can link altogether
......@@ -655,7 +668,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
fi
LT_INIT
......@@ -667,7 +680,7 @@ DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
......
......@@ -86,8 +86,10 @@ module ELPA1
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -110,7 +112,6 @@ module ELPA1
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
include 'mpif.h'
!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail
......@@ -328,6 +329,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
......
......@@ -57,6 +57,7 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! set default to private
......@@ -86,8 +87,6 @@ module ELPA1_compute
public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
include 'mpif.h'
contains
#define DATATYPE REAL(kind=rk)
......@@ -174,7 +173,6 @@ module ELPA1_compute
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
......@@ -296,7 +294,11 @@ module ELPA1_compute
aux1(2) = 0.
endif
#ifdef WITH_MPI
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
aux2 = aux1
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -319,7 +321,9 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -408,7 +412,11 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
uc = tmp
#endif
endif
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
......@@ -419,8 +427,11 @@ module ELPA1_compute
x = 0
if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#else
vav = x
#endif
! store u and v in the matrices U and V
! these matrices are stored combined in one here
......@@ -481,7 +492,7 @@ module ELPA1_compute
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop
endif
#ifdef WITH_MPI
tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmp = d
......@@ -490,6 +501,7 @@ module ELPA1_compute
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#endif
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when deallocating tmp "//errorMessage
......@@ -570,7 +582,6 @@ module ELPA1_compute
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
......@@ -654,9 +665,10 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#endif
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
......@@ -680,9 +692,11 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
if (nc>0) h2 = h1
#endif
! Calculate triangular matrix T
nc = 0
......@@ -702,7 +716,11 @@ module ELPA1_compute
else
tmp1(1:l_cols*nstor) = 0
endif
#ifdef WITH_MPI
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
......@@ -800,6 +818,7 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mult_at_b_real")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
......@@ -900,9 +919,9 @@ module ELPA1_compute
enddo
! Broadcast block column
#ifdef WITH_MPI
call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr)
#endif
! Insert what we got in aux_mat
n_aux_bc = 0
......@@ -947,8 +966,11 @@ module ELPA1_compute
endif
! Sum up the results and send to processor row np
#ifdef WITH_MPI
call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
......@@ -1189,9 +1211,11 @@ module ELPA1_compute
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
aux2 = aux1
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -1213,7 +1237,9 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -1306,7 +1332,11 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
uc = tmp
#endif
endif
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
......@@ -1323,8 +1353,11 @@ module ELPA1_compute
xc = 0
if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr)
#else
vav = xc
#endif
! store u and v in the matrices U and V
! these matrices are stored combined in one here
......@@ -1376,9 +1409,13 @@ module ELPA1_compute
e(1) = vrl
a(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr)
#endif
endif
#ifdef WITH_MPI
call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,pcol(2, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
......@@ -1394,7 +1431,7 @@ module ELPA1_compute
print *,"tridiag_complex: error when allocating tmpr "//errorMessage
stop
endif
#ifdef WITH_MPI
tmpr = d
call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmpr = d
......@@ -1403,6 +1440,7 @@ module ELPA1_compute
call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmpr = e
call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#endif
deallocate(tmpr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmpr "//errorMessage
......@@ -1477,12 +1515,12 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
......@@ -1571,9 +1609,10 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr)
#endif
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
......@@ -1597,9 +1636,11 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
if (nc>0) h2=h1
#endif
! Calculate triangular matrix T
nc = 0
......@@ -1619,7 +1660,11 @@ module ELPA1_compute
else
tmp1(1:l_cols*nstor) = 0
endif
#ifdef WITH_MPI
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor)
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), &
......@@ -1718,11 +1763,11 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mult_ah_b_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b
l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
......@@ -1818,9 +1863,9 @@ module ELPA1_compute
enddo
! Broadcast block column
#ifdef WITH_MPI
call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr)
#endif
! Insert what we got in aux_mat
n_aux_bc = 0
......@@ -1865,8 +1910,11 @@ module ELPA1_compute
endif
! Sum up the results and send to processor row np
#ifdef WITH_MPI
call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
......@@ -1925,11 +1973,11 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
......@@ -2091,8 +2139,10 @@ module ELPA1_compute
! nlen-noff is always > nblk_ev
integer(kind=ik) :: np_off, nprocs
integer(kind=ik) :: np1, np2, noff, nlen, nmid, n, &
mpi_status(mpi_status_size)
integer(kind=ik) :: np1, np2, noff, nlen, nmid, n
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(mpi_status_size)
#endif
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -2118,24 +2168,36 @@ module ELPA1_compute
nmid = limits(np_off+np1) - noff
nlen = limits(np_off+nprocs) - noff
#ifdef WITH_MPI
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr)
enddo
endif
#endif
if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
call mpi_recv(d(noff+1),nmid,MPI_REAL8,np_off,1,mpi_comm_cols,mpi_status,mpierr)
#else
d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif
endif
if (my_pcol==np_off+np1) then
do n=np_off,np_off+np1-1
#ifdef WITH_MPI
call mpi_send(d(noff+nmid+1),nlen-nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr)
#endif
enddo
endif
if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
call mpi_recv(d(noff+nmid+1),nlen-nmid,MPI_REAL8,np_off+np1,1,mpi_comm_cols,mpi_status,mpierr)
#else
d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif
endif
if (nprocs == np_cols) then
! Last merge, result distribution must be block cyclic, noff==0,
......@@ -2196,7 +2258,6 @@ module ELPA1_compute
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
success = .true.
! Calculate the number of subdivisions needed.
......@@ -2296,11 +2357,13 @@ module ELPA1_compute
noff = limits(np)
nlen = limits(np+1)-noff
#ifdef WITH_MPI
call MPI_Bcast(d(noff+1),nlen,MPI_REAL8,np,mpi_comm_rows,mpierr)
#endif