Optional build of ELPA without MPI

The configure flag "--enable-shared-memory-only" triggers a build
of ELPA without MPI support:

- all MPI calls are skipped (or overloaded), as sketched below
- all calls to ScaLAPACK functions are replaced by the corresponding
  LAPACK calls
- all calls to BLACS are skipped
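
The pattern used for the first point is a plain preprocessor switch on WITH_MPI
around every communication call. A minimal, self-contained sketch of the idea
(illustrative only, not ELPA code; the program and variable names are made up):

! guard_sketch.F90 - illustration of the WITH_MPI guard introduced in this commit.
! serial build:  gfortran -cpp guard_sketch.F90
! MPI build:     mpif90 -cpp -DWITH_MPI guard_sketch.F90
program guard_sketch
#ifdef WITH_MPI
  use mpi
#endif
  implicit none
  real(kind=8) :: aux1(2), aux2(2)
  integer      :: mpierr
#ifdef WITH_MPI
  call mpi_init(mpierr)
#endif
  aux1 = (/ 1.0d0, 2.0d0 /)
#ifdef WITH_MPI
  ! with MPI: sum the per-task contributions over all tasks
  call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, mpierr)
#else
  ! shared-memory-only build: there is only one task, the "reduction" is a copy
  aux2 = aux1
#endif
  write(*,*) 'result: ', aux2
#ifdef WITH_MPI
  call mpi_finalize(mpierr)
#endif
end program guard_sketch

In the actual sources this guard appears directly around the mpi_allreduce,
MPI_Bcast, mpi_send/mpi_recv and MPI_Sendrecv calls, as the hunks below show.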

Using ELPA without MPI gives the same results as using ELPA with 1 MPI
task!
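
The new src/mod_mpi.F90 / src/mod_mpi_stubs.F90 files listed in the Makefile.am
hunk below presumably back this up by letting the rest of the code keep asking
for a rank and a size even without MPI. A hypothetical sketch of that idea
(module name and bodies are assumptions for illustration, not the actual file
contents):

module elpa_mpi_stubs_sketch
  ! Hypothetical stub layer: without MPI every "communicator" holds exactly
  ! one task, so rank/size queries return 0 and 1.
  implicit none
contains
  subroutine mpi_comm_rank(comm, rank, ierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: rank, ierr
    rank = 0     ! the one and only task
    ierr = 0
  end subroutine mpi_comm_rank

  subroutine mpi_comm_size(comm, ntasks, ierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: ntasks, ierr
    ntasks = 1   ! hence the same results as a run with 1 MPI task
    ierr = 0
  end subroutine mpi_comm_size
end module elpa_mpi_stubs_sketch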

This version is not yet optimized for performance; here and there some
unnecessary copies are made.

This version is intended for users who do not have MPI in their
application but would still like to use ELPA on a single compute node.
parent 29d84527
......@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -22,9 +24,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdlarfb.F90 \
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
......@@ -38,6 +40,13 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if !WITH_MPI
libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
......
......@@ -68,13 +68,26 @@ if test x"${enable_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi
AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
AS_HELP_STRING([--enable-shared-memory-only],
[do not use MPI; ELPA will be built for one-node shared-memory runs only]),
[],
[enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl check whether mpi compilers are available;
dnl if not abort since it is mandatory
# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
......@@ -87,12 +100,10 @@ AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS
# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
......@@ -386,6 +397,7 @@ else
AC_MSG_ERROR([could not link with lapack: specify path])
fi
if test x"${enable_shared_memory_only}" = x"no"; then
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
......@@ -416,6 +428,7 @@ else
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
dnl check whether we can link altogether
AC_MSG_CHECKING([whether we can link a Fortran program with all blacs/scalapack])
......@@ -667,7 +680,7 @@ DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])
DESPERATELY_WANT_ASSUMED_SIZE=0
if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
fi
......
......@@ -88,6 +88,8 @@ module ELPA1
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -110,7 +112,6 @@ module ELPA1
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
include 'mpif.h'
!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail
......@@ -328,6 +329,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
......
......@@ -57,6 +57,7 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! set default to private
......@@ -86,8 +87,6 @@ module ELPA1_compute
public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
include 'mpif.h'
contains
#define DATATYPE REAL(kind=rk)
......@@ -174,7 +173,6 @@ module ELPA1_compute
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
......@@ -296,7 +294,11 @@ module ELPA1_compute
aux1(2) = 0.
endif
#ifdef WITH_MPI
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
aux2 = aux1
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -319,7 +321,9 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -408,7 +412,11 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
uc = tmp
#endif
endif
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
......@@ -419,8 +427,11 @@ module ELPA1_compute
x = 0
if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#else
vav = x
#endif
! store u and v in the matrices U and V
! these matrices are stored combined in one here
......@@ -481,7 +492,7 @@ module ELPA1_compute
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop
endif
#ifdef WITH_MPI
tmp = d
call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmp = d
......@@ -490,6 +501,7 @@ module ELPA1_compute
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmp = e
call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#endif
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when deallocating tmp "//errorMessage
......@@ -570,7 +582,6 @@ module ELPA1_compute
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
......@@ -654,9 +665,10 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#endif
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
......@@ -680,9 +692,11 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
if (nc>0) h2 = h1
#endif
! Calculate triangular matrix T
nc = 0
......@@ -702,7 +716,11 @@ module ELPA1_compute
else
tmp1(1:l_cols*nstor) = 0
endif
#ifdef WITH_MPI
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
......@@ -800,6 +818,7 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mult_at_b_real")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
......@@ -900,9 +919,9 @@ module ELPA1_compute
enddo
! Broadcast block column
#ifdef WITH_MPI
call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr)
#endif
! Insert what we got in aux_mat
n_aux_bc = 0
......@@ -947,8 +966,11 @@ module ELPA1_compute
endif
! Sum up the results and send to processor row np
#ifdef WITH_MPI
call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
......@@ -1189,9 +1211,11 @@ module ELPA1_compute
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
aux2 = aux1
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -1213,7 +1237,9 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
......@@ -1306,7 +1332,11 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
uc = tmp
#endif
endif
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
......@@ -1323,8 +1353,11 @@ module ELPA1_compute
xc = 0
if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr)
#else
vav = xc
#endif
! store u and v in the matrices U and V
! these matrices are stored combined in one here
......@@ -1376,9 +1409,13 @@ module ELPA1_compute
e(1) = vrl
a(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr)
#endif
endif
#ifdef WITH_MPI
call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,pcol(2, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
......@@ -1394,7 +1431,7 @@ module ELPA1_compute
print *,"tridiag_complex: error when allocating tmpr "//errorMessage
stop
endif
#ifdef WITH_MPI
tmpr = d
call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmpr = d
......@@ -1403,6 +1440,7 @@ module ELPA1_compute
call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
tmpr = e
call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr)
#endif
deallocate(tmpr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmpr "//errorMessage
......@@ -1477,12 +1515,12 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
......@@ -1571,9 +1609,10 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr)
#endif
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
......@@ -1597,9 +1636,11 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
if (nc>0) h2=h1
#endif
! Calculate triangular matrix T
nc = 0
......@@ -1619,7 +1660,11 @@ module ELPA1_compute
else
tmp1(1:l_cols*nstor) = 0
endif
#ifdef WITH_MPI
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor)
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), &
......@@ -1718,11 +1763,11 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mult_ah_b_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b
l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b
......@@ -1818,9 +1863,9 @@ module ELPA1_compute
enddo
! Broadcast block column
#ifdef WITH_MPI
call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr)
#endif
! Insert what we got in aux_mat
n_aux_bc = 0
......@@ -1865,8 +1910,11 @@ module ELPA1_compute
endif
! Sum up the results and send to processor row np
#ifdef WITH_MPI
call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr)
#else
tmp2 = tmp1
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
......@@ -1925,11 +1973,11 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_tridi")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
......@@ -2091,8 +2139,10 @@ module ELPA1_compute
! nlen-noff is always > nblk_ev
integer(kind=ik) :: np_off, nprocs
integer(kind=ik) :: np1, np2, noff, nlen, nmid, n, &
mpi_status(mpi_status_size)
integer(kind=ik) :: np1, np2, noff, nlen, nmid, n
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(mpi_status_size)
#endif
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -2118,24 +2168,36 @@ module ELPA1_compute
nmid = limits(np_off+np1) - noff
nlen = limits(np_off+nprocs) - noff
#ifdef WITH_MPI
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr)
enddo
endif
#endif
if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
call mpi_recv(d(noff+1),nmid,MPI_REAL8,np_off,1,mpi_comm_cols,mpi_status,mpierr)
#else
d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif
endif
if (my_pcol==np_off+np1) then
do n=np_off,np_off+np1-1
#ifdef WITH_MPI
call mpi_send(d(noff+nmid+1),nlen-nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr)
#endif
enddo
endif
if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
call mpi_recv(d(noff+nmid+1),nlen-nmid,MPI_REAL8,np_off+np1,1,mpi_comm_cols,mpi_status,mpierr)
#else
d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif
endif
if (nprocs == np_cols) then
! Last merge, result distribution must be block cyclic, noff==0,
......@@ -2196,7 +2258,6 @@ module ELPA1_compute
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
success = .true.
! Calculate the number of subdivisions needed.
......@@ -2296,11 +2357,13 @@ module ELPA1_compute
noff = limits(np)
nlen = limits(np+1)-noff
#ifdef WITH_MPI
call MPI_Bcast(d(noff+1),nlen,MPI_REAL8,np,mpi_comm_rows,mpierr)
#endif
qmat2 = qmat1
#ifdef WITH_MPI
call MPI_Bcast(qmat2,max_size*max_size,MPI_REAL8,np,mpi_comm_rows,mpierr)
#endif
do i=1,nlen
call distribute_global_column(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
enddo
......@@ -2525,7 +2588,10 @@ module ELPA1_compute
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr, mpi_status(mpi_status_size)
np_cols, mpierr
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(mpi_status_size)
#endif
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
......@@ -2538,8 +2604,7 @@ module ELPA1_compute
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads, omp_get_thread_num
integer(kind=ik) :: istat
character(200) :: errorMessage
max_threads = omp_get_max_threads()
......@@ -2559,7 +2624,6 @@ module ELPA1_compute
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! If my processor column isn't in the requested set, do nothing
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
......@@ -3080,10 +3144,11 @@ module ELPA1_compute
else
np_rem = np_rem-1
endif
#ifdef WITH_MPI
call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, &
np_next, 1111, np_prev, 1111, &
mpi_comm_cols, mpi_status, mpierr)
#endif
endif
! Gather the parts in d1 and z which are fitting to qtmp1.
......@@ -3263,10 +3328,16 @@ module ELPA1_compute
! send and receive column are local
qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
else
#ifdef WITH_MPI
call mpi_send(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,mod(i,4096),mpi_comm_cols,mpierr)
#endif
endif
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call mpi_recv(qtmp(1,nc),l_rows,MPI_REAL8,pc1,mod(i,4096),mpi_comm_cols,mpi_status,mpierr)
#else
qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,nc)
#endif
endif
enddo
......@@ -3313,15 +3384,23 @@ module ELPA1_compute
q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
q(l_rqs:l_rqe,lc1) = tmp(1:l_rows)
else
#ifdef WITH_MPI
call mpi_sendrecv(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,1, &
tmp,l_rows,MPI_REAL8,pc2,1, &
mpi_comm_cols,mpi_status,mpierr)
#else
tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
#endif
q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1)
endif
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call mpi_sendrecv(q(l_rqs