Commit 49f119aa authored by Andreas Marek

Optional build of ELPA without MPI

The configure flag "--enable-shared-memory-only" triggers a build
of ELPA without MPI support:

- all MPI calls are skipped (or overloaded)
- all calls to ScaLAPACK functions are replaced by the corresponding
  LAPACK calls
- all calls to BLACS are skipped

Using ELPA without MPI gives the same results as using ELPA with 1 MPI
task!
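
The "skipped (or overloaded)" MPI calls are handled through a stub layer
(src/mod_mpi_stubs.F90 is added in this commit). Below is a minimal sketch of
what such serial overloads can look like; the module name and routine bodies
are illustrative assumptions, not the contents of the shipped file:

! Illustrative sketch only -- not the shipped src/mod_mpi_stubs.F90.
module mpi_stubs_sketch
  implicit none
  integer, parameter :: MPI_COMM_WORLD = 0
contains
  subroutine mpi_comm_rank(comm, rank, mpierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: rank, mpierr
    rank   = 0      ! a shared-memory build always runs as task 0
    mpierr = 0
  end subroutine mpi_comm_rank

  subroutine mpi_comm_size(comm, ntasks, mpierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: ntasks, mpierr
    ntasks = 1      ! every communicator contains exactly one task
    mpierr = 0
  end subroutine mpi_comm_size

  function mpi_wtime() result(seconds)
    real(kind=8)    :: seconds
    integer(kind=8) :: counts, count_rate
    call system_clock(counts, count_rate)
    seconds = real(counts, kind=8) / real(count_rate, kind=8)
  end function mpi_wtime
end module mpi_stubs_sketch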

This version is not yet optimized for performance; here and there some
unnecessary copies are made.

This version is intended for users who do not have MPI in their
application but would still like to use ELPA on a single compute node.
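
Throughout the diff below, collective communication follows one recurring
pattern: the MPI call is guarded by the WITH_MPI preprocessor macro, and the
serial branch simply copies the send buffer into the place where the receive
buffer would have been filled. Schematically (generic buffer and size names,
not one specific routine of this commit):

#ifdef WITH_MPI
      call mpi_allreduce(work(1), work(sendsize+1), sendsize, &
                         mpi_real8, mpi_sum, mpi_comm, mpierr)
#else
      ! single task: the "reduced" result is just the local contribution
      work(sendsize+1:2*sendsize) = work(1:sendsize)
#endif

With only one task, the reduction of a single contribution is that
contribution itself, which is why the shared-memory build reproduces the
results of a run with 1 MPI task.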
parent 29d84527
......@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -22,9 +24,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdlarfb.F90 \
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
......@@ -38,6 +40,13 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if !WITH_MPI
libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
......
......@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi
AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
AS_HELP_STRING([--enable-shared-memory-only],
[do not use MPI; ELPA will be built for one-node shared-memory runs only]),
[],
[enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl check whether mpi compilers are available;
dnl if not abort since it is mandatory
# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS
# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
......@@ -106,11 +117,11 @@ AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
......@@ -386,35 +397,37 @@ else
AC_MSG_ERROR([could not link with lapack: specify path])
fi
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${enable_shared_memory_only}" = x"no"; then
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
fi
fi
fi
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
dnl check whether we can link altogether
......@@ -655,7 +668,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
fi
LT_INIT
......@@ -667,7 +680,7 @@ DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
......
......@@ -86,8 +86,10 @@ module ELPA1
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -110,7 +112,6 @@ module ELPA1
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
include 'mpif.h'
!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail
......@@ -328,6 +329,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
......
This diff is collapsed.
......@@ -73,6 +73,7 @@ module ELPA2
use elpa2_compute
use elpa_pdgeqrf
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -82,7 +83,6 @@ module ELPA2
public :: solve_evp_real_2stage
public :: solve_evp_complex_2stage
include 'mpif.h'
!******
contains
......@@ -170,7 +170,6 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
......@@ -269,10 +268,10 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0
#ifdef WITH_MPI
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
......@@ -399,7 +398,6 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
......@@ -473,10 +471,10 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0
#ifdef WITH_MPI
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
......
This diff is collapsed.
......@@ -110,9 +110,9 @@
subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......@@ -387,9 +387,9 @@
subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......@@ -629,9 +629,9 @@
subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......
......@@ -48,7 +48,7 @@ module elpa_pdgeqrf
use elpa1_compute
use elpa_pdlarfb
use qr_utils_mod
use elpa_mpi
implicit none
PRIVATE
......@@ -57,7 +57,6 @@ module elpa_pdgeqrf
public :: qr_pqrparam_init
public :: qr_pdlarfg2_1dcomm_check
include 'mpif.h'
contains
......@@ -120,7 +119,6 @@ module elpa_pdgeqrf
! copy value before we are going to filter it
total_cols = n
call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
......@@ -235,9 +233,10 @@ module elpa_pdgeqrf
!end if
! initiate broadcast (send part)
#ifdef WITH_MPI
call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
mpirank_cols_qr,mpicomm_cols,mpierr)
#endif
! copy tau parts into temporary tau buffer
work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)
......@@ -257,9 +256,10 @@ module elpa_pdgeqrf
broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
! initiate broadcast (recv part)
#ifdef WITH_MPI
call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
mpirank_cols_qr,mpicomm_cols,mpierr)
#endif
! last n*n elements in buffer are (still empty) T matrix elements
! fetch from first process in each column
......@@ -530,10 +530,8 @@ module elpa_pdgeqrf
maxrank = min(PQRPARAM(1),n)
updatemode = PQRPARAM(2)
hgmode = PQRPARAM(4)
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
if (trans .eq. 1) then
incx = lda
else
......@@ -713,10 +711,8 @@ module elpa_pdgeqrf
#endif
return
end if
call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)
! calculate expected work size and store in work(1)
if (hgmode .eq. ichar('s')) then
! allreduce (MPI_SUM)
......@@ -770,11 +766,13 @@ module elpa_pdgeqrf
work(1) = alpha
work(2) = dot
#ifdef WITH_MPI
call mpi_allreduce(work(1),work(sendsize+1), &
sendsize,mpi_real8,mpi_sum, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
alpha = work(sendsize+1)
xnorm = sqrt(work(sendsize+2))
else if (hgmode .eq. ichar('x')) then
......@@ -790,11 +788,13 @@ module elpa_pdgeqrf
work(2*iproc+1) = alpha
work(2*iproc+2) = xnorm
end do
#ifdef WITH_MPI
call mpi_alltoall(work(1),2,mpi_real8, &
work(sendsize+1),2,mpi_real8, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+2-1) = work(1:2)
#endif
! extract alpha value
alpha = work(sendsize+1+mpirank_top*2)
......@@ -816,10 +816,13 @@ module elpa_pdgeqrf
work(2) = xnorm
! allgather
#ifdef WITH_MPI
call mpi_allgather(work(1),sendsize,mpi_real8, &
work(sendsize+1),sendsize,mpi_real8, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
! extract alpha value
alpha = work(sendsize+1+mpirank_top*2)
......@@ -1040,10 +1043,8 @@ module elpa_pdgeqrf
#endif
return
end if
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
local_size1,baseoffset,local_offset1)
......@@ -1088,8 +1089,13 @@ module elpa_pdgeqrf
work(8) = 0.0d0 ! fill up buffer
! exchange partial results
#ifdef WITH_MPI
call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
mpicomm, mpierr)
#else
seed(1:8) = work(1:8)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("qr_pdlarfg2_1dcomm_seed")
#endif
......@@ -1188,10 +1194,8 @@ module elpa_pdgeqrf
call timer%start("qr_pdlarfg2_1dcomm_vector")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
local_size,baseoffset,local_offset)
......@@ -1269,10 +1273,10 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("qr_pdlarfg2_1dcomm_update")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
! seed should be updated by previous householder generation
! Update inner product of this column and next column vector
top11 = seed(1)
......@@ -1503,9 +1507,9 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("qr_pdlarfgk_1dcomm_seed")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
C_size = k*k
D_size = k*k
sendoffset = 1
......@@ -1571,9 +1575,12 @@ module elpa_pdgeqrf
! TODO: store symmetric part more efficiently
! allreduce operation on results
#ifdef WITH_MPI
call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
#endif
! unpack result from buffer into seedC and seedD
seedC(1:k,1:k) = 0.0d0
do icol=1,k
......@@ -1918,7 +1925,6 @@ module elpa_pdgeqrf
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
lidx = baseidx-sidx+1
call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
local_size,baseoffset,local_offset)
......@@ -2024,7 +2030,6 @@ module elpa_pdgeqrf
endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
lidx = baseidx-sidx
if (lidx .lt. 1) then
#ifdef HAVE_DETAILED_TIMINGS
......@@ -2180,7 +2185,6 @@ module elpa_pdgeqrf
#endif
call mpi_comm_rank(mpicomm,mpirank,mpierr)
call mpi_comm_size(mpicomm,mpiprocs,mpierr)
call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, &
local_size,baseoffset,offset)
......@@ -2385,7 +2389,6 @@ module elpa_pdgeqrf
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
local_size,v_offset,x_offset)
v_offset = v_offset * incv
......
......@@ -40,11 +40,13 @@
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
module elpa_pdlarfb
use elpa1_compute
use qr_utils_mod
use elpa_mpi
implicit none
PRIVATE
......@@ -57,8 +59,6 @@ module elpa_pdlarfb
public :: qr_pdlarfl2_tmatrix_1dcomm
public :: qr_tmerge_pdlarfb_1dcomm
include 'mpif.h'
contains
subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork)
......@@ -102,10 +102,8 @@ subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpic
end if
!print *,'updating trailing matrix with k=',k
call MPI_Comm_rank(mpicomm,mpirank,mpierr)
call MPI_Comm_size(mpicomm,mpiprocs,mpierr)
! use baseidx as idx here, otherwise the upper triangle part will be lost
! during the calculation, especially in the reversed case
call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
......@@ -119,8 +117,11 @@ subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpic
end if
! data exchange
#ifdef WITH_MPI
call mpi_allreduce(work(1,1),work(1,n+1),k*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(1:k*n,n+1) = work(1:k*n,1)
#endif
call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t,ldt,work(1,n+1),k)
end subroutine qr_pdlarfb_1dcomm
......@@ -158,10 +159,8 @@ subroutine qr_pdlarft_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx
work(1,1) = DBLE(2*(k*k+k*n+oldk))
return
end if
call MPI_Comm_rank(mpicomm,mpirank,mpierr)
call MPI_Comm_size(mpicomm,mpiprocs,mpierr)
call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
localsize,baseoffset,offset)
......@@ -180,8 +179,11 @@ subroutine qr_pdlarft_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx
end if
! exchange data
#ifdef WITH_MPI
call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(1:sendsize,recvoffset) = work(1:sendsize,sendoffset)
#endif
! generate T matrix (pdlarft)
t(1:k,1:k) = 0.0d0 ! DEBUG: clear buffer first
......@@ -231,10 +233,8 @@ subroutine qr_pdlarft_set_merge_1dcomm(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,
work(1,1) = DBLE(2*n*n)
return
end if
call MPI_Comm_rank(mpicomm,mpirank,mpierr)
call MPI_Comm_size(mpicomm,mpiprocs,mpierr)
call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
localsize,baseoffset,offset)
......@@ -243,9 +243,11 @@ subroutine qr_pdlarft_set_merge_1dcomm(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,
else
work(1:n,1:n) = 0.0d0
end if
#ifdef WITH_MPI
call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(1:n,n+1:n+1+n-1) = work(1:n,1:n)
#endif
! skip Y4'*Y4 part
offset = mod(n,blocksize)
if (offset .eq. 0) offset=blocksize
......@@ -280,10 +282,8 @@ subroutine qr_pdlarft_tree_merge_1dcomm(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,b
end if
if (n .le. blocksize) return ! nothing to do
call MPI_Comm_rank(mpicomm,mpirank,mpierr)
call MPI_Comm_size(mpicomm,mpiprocs,mpierr)
call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
localsize,baseoffset,offset)
......@@ -292,9 +292,11 @@ subroutine qr_pdlarft_tree_merge_1dcomm(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,b
else
work(1:n,1:n) = 0.0d0
end if
#ifdef WITH_MPI
call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(1:n,n+1:n+1+n-1) = work(1:n,1:n)
#endif
! skip Y4'*Y4 part
offset = mod(n,blocksize)
if (offset .eq. 0) offset=blocksize
......@@ -330,10 +332,8 @@ subroutine qr_pdlarfl_1dcomm(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,
! external functions
real(kind=rk), external :: ddot
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
sendsize = n
recvsize = sendsize
......@@ -362,9 +362,11 @@ subroutine qr_pdlarfl_1dcomm(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,
else
work(1:n) = 0.0d0
end if
#ifdef WITH_MPI
call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
if (local_size > 0) then
do icol=1,n
......@@ -406,10 +408,8 @@ subroutine qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,i
! external functions
real(kind=rk), external :: ddot
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
sendsize = 2*n
recvsize = sendsize
......@@ -445,9 +445,11 @@ subroutine qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,i
call dgemv("Trans",local_size1,n,1.0d0,a(local_offset1,1),lda,v(v1_local_offset,v1col),1,0.0d0,work(dgemv1_offset),1)
call dgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda,v(v2_local_offset,v2col),1,0.0d0, &
work(dgemv2_offset),1)
#ifdef WITH_MPI
call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
! update second vector
call daxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1)
......@@ -536,10 +538,8 @@ subroutine qr_tmerge_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,
work(1) = DBLE(2*sendsize)
return
end if
call MPI_Comm_rank(mpicomm,mpirank,mpierr)
call MPI_Comm_size(mpicomm,mpiprocs,mpierr)
! use baseidx as idx here, otherwise the upper triangle part will be lost
! during the calculation, especially in the reversed case
call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
......@@ -605,8 +605,11 @@ subroutine qr_tmerge_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,
if (sendsize .le. 0) return ! nothing to do
! exchange data
#ifdef WITH_MPI
call mpi_allreduce(work(sendoffset),work(recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(recvoffset:recvoffset+sendsize-1) = work(sendoffset:sendoffset+sendsize-1)
#endif
updateoffset = recvoffset+updateoffset
mergeoffset = recvoffset+mergeoffset
tgenoffset = recvoffset+tgenoffset
......
......@@ -40,8 +40,10 @@
! the original distribution, the GNU Lesser General Public License.
!
!
module qr_utils_mod
#include "config-f90.h"
module qr_utils_mod
use elpa_mpi
implicit none
PRIVATE
......@@ -185,12 +187,10 @@ subroutine reverse_matrix_2dcomm_ref(m,n,mb,nb,a,lda,work,lwork,mpicomm_cols,mpi
integer(kind=ik) :: mpiprocs_cols,mpiprocs_rows
integer(kind=ik) :: mpierr