Commit d9011373 authored by Andreas Marek

Merge branch 'master' into ELPA_GPU

parents bb63bd9e 9af253bc
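
The recurring change in this merge: every MPI call in the touched Fortran sources is wrapped in an #ifdef WITH_MPI guard with a serial fallback, and the old "include 'mpif.h'" lines are replaced by "use elpa_mpi" (backed by the new src/mod_mpi.F90 / src/mod_mpi_stubs.F90), so the library can also be built with --enable-shared-memory-only. A minimal sketch of that pattern, adapted from the hunks below; the subroutine name and argument list are illustrative only, and rk/ik are assumed to be the kinds from src/mod_precision.F90:

   subroutine sum_over_rows(aux1, aux2, mpi_comm_rows, mpierr)
      use precision
      use elpa_mpi   ! resolves to the real MPI module or to the stubs, depending on the build
      implicit none
      real(kind=rk), intent(in)     :: aux1(2)
      real(kind=rk), intent(out)    :: aux2(2)
      integer(kind=ik), intent(in)  :: mpi_comm_rows
      integer(kind=ik), intent(out) :: mpierr

#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
      call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
      call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
      ! shared-memory-only build: the reduction over a single task is just a copy
      aux2 = aux1
      mpierr = 0
#endif /* WITH_MPI */
   end subroutine sum_over_rows
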
@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
@@ -26,9 +28,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdlarfb.F90 \
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
@@ -47,6 +49,13 @@ if WITH_GPU_VERSION
#src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if !WITH_MPI
libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
@@ -370,6 +379,7 @@ distclean-local:
EXTRA_DIST = \
fdep/fortran_dependencies.pl \
fdep/fortran_dependencies.mk \
test/fortran_test_programs/elpa_test_programs_print_headers.X90 \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/redist_band.X90 \
@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi
AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
AS_HELP_STRING([--enable-shared-memory-only],
[do not use MPI; ELPA will be build for one node shared-memory runs only]),
[],
[enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl check whether mpi compilers are available;
dnl if not abort since it is mandatory
# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS
# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
@@ -106,11 +117,11 @@ AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
@@ -386,35 +397,37 @@ else
AC_MSG_ERROR([could not link with lapack: specify path])
fi
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${enable_shared_memory_only}" = x"no"; then
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
fi
fi
fi
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
dnl check whether we can link alltogehter
@@ -718,7 +731,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
fi
LT_INIT
@@ -48,8 +48,9 @@ module mod_check_for_gpu
function check_for_gpu(myid, numberOfDevices, wantDebug) result(gpuAvailable)
use cuda_functions
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: myid
logical, optional, intent(in) :: wantDebug
logical :: success, wantDebugMessage
@@ -80,6 +81,7 @@ module mod_check_for_gpu
! make sure that all nodes have the same number of GPU's, otherwise
! we run into loadbalancing trouble
#ifdef WITH_MPI
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
@@ -88,7 +90,7 @@ module mod_check_for_gpu
gpuAvailable = .false.
return
endif
#endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
@@ -86,9 +86,11 @@ module ELPA1
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#endif
use iso_c_binding
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
@@ -111,7 +113,6 @@ module ELPA1
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
include 'mpif.h'
!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail
@@ -330,6 +331,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
@@ -57,6 +57,7 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use elpa_mpi
implicit none
PRIVATE ! set default to private
@@ -86,8 +87,6 @@ module ELPA1_compute
public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
include 'mpif.h'
contains
#ifdef DOUBLE_PRECISION_REAL
@@ -316,11 +315,17 @@ module ELPA1_compute
aux1(2) = 0.
endif
#ifdef WITH_MPI
#if DOUBLE_PRECISION_REAL
call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
@@ -342,11 +347,15 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
@@ -458,12 +467,18 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
uc = tmp
#endif /* WITH_MPI */
endif
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
@@ -474,11 +489,19 @@ module ELPA1_compute
x = 0
if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#else /* WITH_MPI */
vav = x
#endif /* WITH_MPI */
! store u and v in the matrices U and V
! these matrices are stored combined in one here
@@ -546,6 +569,8 @@ module ELPA1_compute
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
tmp = d
call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
@@ -565,6 +590,8 @@ module ELPA1_compute
tmp = e
call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when deallocating tmp "//errorMessage
@@ -645,7 +672,6 @@ module ELPA1_compute
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
@@ -729,12 +755,15 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
@@ -763,12 +792,17 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
if (nc>0) h2 = h1
#endif /* WITH_MPI */
! Calculate triangular matrix T
nc = 0
@@ -798,21 +832,33 @@ module ELPA1_compute
else
tmp1(1:l_cols*nstor) = 0
endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk, q, ldq)
endif
#else
#else /* DOUBLE_PRECISION_REAL */
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk, q, ldq)
endif
#endif
#endif /* DOUBLE_PRECISION_REAL */
nstor = 0
endif
@@ -933,6 +979,7 @@ module ELPA1_compute
print *,"na lt ldcCols ",na,ldcCols
stop
endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -1033,12 +1080,15 @@ module ELPA1_compute
enddo
! Broadcast block column
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
! Insert what we got in aux_mat
n_aux_bc = 0
@@ -1089,11 +1139,17 @@ module ELPA1_compute
endif
! Sum up the results and send to processor row np
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL8, MPI_SUM, np, mpi_comm_rows, mpierr)
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
tmp2 = tmp1
#endif /* WITH_MPI */
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
@@ -1353,11 +1409,17 @@ module ELPA1_compute
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
@@ -1379,11 +1441,15 @@ module ELPA1_compute
! Broadcast the Householder vector (and tau) along columns
if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(vr, l_rows+1, MPI_DOUBLE_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, l_rows+1, MPI_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
@@ -1501,12 +1567,17 @@ module ELPA1_compute
if (l_cols>0) then
tmp(1:l_cols) = uc(1:l_cols)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(tmp, uc, l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp, uc, l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
uc = tmp
#endif /* WITH_MPI */
endif
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
@@ -1523,12 +1594,18 @@ module ELPA1_compute
xc = 0
if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols))
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(xc, vav, 1 , MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr)
#else
call mpi_allreduce(xc, vav, 1 , MPI_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#else /* WITH_MPI */
vav = xc
#endif /* WITH_MPI */
! store u and v in the matrices U and V
! these matrices are stored combined in one here
@@ -1586,18 +1663,29 @@ module ELPA1_compute
e(1) = vrl
a(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
#else
call mpi_bcast(tau(2), 1, MPI_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
#endif
#endif /* WITH_MPI */
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call mpi_bcast(tau(2), 1, MPI_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage)
@@ -1613,6 +1701,8 @@ module ELPA1_compute
stop
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
tmpr = d
call mpi_allreduce(tmpr, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
@@ -1633,6 +1723,7 @@ module ELPA1_compute
call mpi_allreduce(tmpr, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
deallocate(tmpr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmpr "//errorMessage
@@ -1707,12 +1798,12 @@ module ELPA1_compute
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
@@ -1801,12 +1892,15 @@ module ELPA1_compute
nb = nb+l_rows
enddo
#ifdef WITH_MPI
if (nb>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(hvb, nb, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb, nb, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
nb = 0
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
@@ -1833,12 +1927,17 @@ module ELPA1_compute
h1(nc+1:nc+n) = tmat(1:n,n+1)
nc = nc+n
enddo
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)