Commit d9011373 authored by Andreas Marek

Merge branch 'master' into ELPA_GPU

parents bb63bd9e 9af253bc
......@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -26,9 +28,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.f90 \
src/elpa_qr/elpa_pdlarfb.f90 \
src/elpa_qr/elpa_pdlarfb.F90 \
src/elpa_qr/elpa_pdgeqrf.F90
if HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \
......@@ -47,6 +49,13 @@ if WITH_GPU_VERSION
#src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if !WITH_MPI
libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif
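Note on the "if !WITH_MPI" block above: for a shared-memory-only build the library is compiled against stub sources instead of an MPI library. The contents of src/mod_mpi_stubs.F90 are not shown in this diff, so the following is only a hedged sketch of what a minimal stub module could look like (module name, constants and routines are illustrative assumptions, not ELPA's actual code):

! Hypothetical sketch only -- the real src/mod_mpi_stubs.F90 may differ.
module elpa_mpi_stubs
  implicit none
  ! dummy handles so that serial code using "MPI" names still compiles
  integer, parameter :: MPI_COMM_WORLD = 0
  integer, parameter :: MPI_COMM_SELF  = 1
contains
  subroutine mpi_comm_rank(comm, rank, mpierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: rank, mpierr
    rank   = 0      ! exactly one "rank" in a shared-memory-only run
    mpierr = 0
  end subroutine mpi_comm_rank

  subroutine mpi_comm_size(comm, nranks, mpierr)
    integer, intent(in)  :: comm
    integer, intent(out) :: nranks, mpierr
    nranks = 1      ! single process
    mpierr = 0
  end subroutine mpi_comm_size
end module elpa_mpi_stubs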
......@@ -370,6 +379,7 @@ distclean-local:
EXTRA_DIST = \
fdep/fortran_dependencies.pl \
fdep/fortran_dependencies.mk \
test/fortran_test_programs/elpa_test_programs_print_headers.X90 \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/redist_band.X90 \
......
......@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi
AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
AS_HELP_STRING([--enable-shared-memory-only],
[do not use MPI; ELPA will be built for single-node shared-memory runs only]),
[],
[enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl check whether MPI compiler wrappers are available;
dnl they are mandatory unless --enable-shared-memory-only is given
# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
fi
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS
# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
fi
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
......@@ -106,11 +117,11 @@ AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
......@@ -386,35 +397,37 @@ else
AC_MSG_ERROR([could not link with lapack: specify path])
fi
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${enable_shared_memory_only}" = x"no"; then
dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
LIBS="-l${lib} ${old_LIBS}"
AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
AC_MSG_RESULT([${blacs_in_scalapack}])
if test x"${blacs_in_scalapack}" = x"yes"; then
break
fi
done
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
if test x"${blacs_in_scalapack}" = x"no"; then
LIBS="${old_LIBS}"
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
if test x"${have_blacs}" = x"no"; then
AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
fi
fi
fi
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
if test x"${have_scalapack}" = x"no" ; then
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
dnl check whether we can link it all together
......@@ -718,7 +731,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
fi
LT_INIT
......
......@@ -48,8 +48,9 @@ module mod_check_for_gpu
function check_for_gpu(myid, numberOfDevices, wantDebug) result(gpuAvailable)
use cuda_functions
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: myid
logical, optional, intent(in) :: wantDebug
logical :: success, wantDebugMessage
......@@ -80,6 +81,7 @@ module mod_check_for_gpu
! make sure that all nodes have the same number of GPUs, otherwise
! we run into load-balancing trouble
#ifdef WITH_MPI
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
......@@ -88,7 +90,7 @@ module mod_check_for_gpu
gpuAvailable = .false.
return
endif
#endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
......
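The comment in the hunk above gives the reason for the allreduce: every node must report the same number of GPUs, or the work distribution becomes unbalanced. A self-contained, hedged sketch of that consistency check, written against the new elpa_mpi interface (an illustration, not ELPA's actual routine; variable names simply mirror the ones above):

#include "config-f90.h"
subroutine check_device_count(numberOfDevices, gpuAvailable)
  use elpa_mpi          ! MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD or their stubs
  implicit none
  integer, intent(in)  :: numberOfDevices
  logical, intent(out) :: gpuAvailable
  integer              :: maxNumberOfDevices, mpierr

#ifdef WITH_MPI
  ! every rank learns the maximum device count seen anywhere
  call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, &
                     MPI_MAX, MPI_COMM_WORLD, mpierr)
#else
  maxNumberOfDevices = numberOfDevices   ! single process: local count is the maximum
#endif
  ! uneven device counts would unbalance the work distribution
  gpuAvailable = (maxNumberOfDevices == numberOfDevices) .and. (numberOfDevices /= 0)
end subroutine check_device_count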
......@@ -86,9 +86,11 @@ module ELPA1
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#endif
use iso_c_binding
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -111,7 +113,6 @@ module ELPA1
logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs
include 'mpif.h'
!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail
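Throughout this commit, include 'mpif.h' is replaced by use elpa_mpi, so that the same sources compile with or without MPI. The new src/mod_mpi.F90 and src/mod_mpi_stubs.F90 are not shown in this diff; a plausible, hedged sketch of such a wrapper is a module that re-exports either the real MPI header or the serial stubs, selected by the WITH_MPI macro from config-f90.h:

#include "config-f90.h"
! Hedged sketch -- the actual mod_mpi / mod_mpi_stubs split in ELPA
! may be organized differently.
module elpa_mpi
#ifndef WITH_MPI
  use elpa_mpi_stubs   ! serial replacements (see the stub sketch above)
#endif
  implicit none
  public
#ifdef WITH_MPI
  include 'mpif.h'     ! real MPI constants, e.g. MPI_COMM_WORLD, MPI_REAL8
#endif
end module elpa_mpi

The apparent intent is to keep the WITH_MPI selection in one place, so that compute routines only need preprocessor guards around the actual MPI calls rather than around their declarations.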
......@@ -330,6 +331,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
......
This diff is collapsed.
......@@ -80,6 +80,8 @@ module ELPA2
! use cuda_c_kernel
! use iso_c_binding
#endif
use elpa_mpi
implicit none
PRIVATE ! By default, all routines contained are private
......@@ -88,7 +90,6 @@ module ELPA2
public :: solve_evp_real_2stage
public :: solve_evp_complex_2stage
include 'mpif.h'
!******
contains
......@@ -177,6 +178,7 @@ contains
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_real_2stage")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
......@@ -295,7 +297,7 @@ contains
ttts = ttt0
call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
print *, "hier 1:", q(10,10)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
......@@ -312,10 +314,14 @@ contains
ttt0 = MPI_Wtime()
call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
print *, "hier 2:", q(10,10)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
......@@ -324,6 +330,7 @@ contains
call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif
#endif /* WITH_MPI */
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
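A small aside on the guard above: a broadcast needs no serial fallback branch, because with a single process the data on rank 0 is already the final result, whereas the allreduce/allgather sites later in this commit get an explicit copy in their #else branch. A trivial, hedged illustration (not ELPA code):

program bcast_is_a_noop_serially
  implicit none
  real(kind=8) :: ev(3)
  ev = (/ 1.0d0, 2.0d0, 3.0d0 /)   ! "rank 0" already holds the eigenvalues
#ifdef WITH_MPI
  ! an MPI build would broadcast here:
  ! call mpi_bcast(ev, 3, MPI_REAL8, 0, mpi_comm_all, mpierr)
#endif
  ! serial build: nothing to do, ev is already complete on the only process
  print *, ev
end program bcast_is_a_noop_serially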
......@@ -332,6 +339,8 @@ contains
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
print *, "hier 3:", q(10,10)
if (.not.(success)) return
ttt1 = MPI_Wtime()
......@@ -348,9 +357,12 @@ contains
! Backtransform stage 1
ttt0 = MPI_Wtime()
print *, "hier 4a:", q(10,10)
call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
print *, "hier 4:", q(10,10)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
......@@ -369,6 +381,8 @@ contains
ttt0 = MPI_Wtime()
call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
print *, "hier 5:", q(10,10)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
......@@ -461,6 +475,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_complex_2stage")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
......@@ -545,6 +560,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop()
#endif
......@@ -566,10 +582,13 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
ttt0 = MPI_Wtime()
call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr)
......@@ -577,6 +596,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr)
#endif
#endif /* WITH_MPI */
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
......@@ -631,8 +652,6 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
stop
endif
! Backtransform stage 2
ttt0 = MPI_Wtime()
......
This source diff could not be displayed because it is too large.
......@@ -110,9 +110,9 @@
subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......@@ -387,9 +387,9 @@
subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......@@ -629,9 +629,9 @@
subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s)
use precision
use elpa_mpi
implicit none
include 'mpif.h'
integer(kind=ik), intent(in) :: nb, ldq, ldh
complex(kind=ck), intent(inout) :: q(ldq/2,*)
......
......@@ -48,7 +48,7 @@ module elpa_pdgeqrf
use elpa1_compute
use elpa_pdlarfb
use qr_utils_mod
use elpa_mpi
implicit none
PRIVATE
......@@ -57,7 +57,6 @@ module elpa_pdgeqrf
public :: qr_pqrparam_init
public :: qr_pdlarfg2_1dcomm_check
include 'mpif.h'
contains
......@@ -120,7 +119,6 @@ module elpa_pdgeqrf
! copy value before we are going to filter it
total_cols = n
call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
......@@ -235,9 +233,10 @@ module elpa_pdgeqrf
!end if
! initiate broadcast (send part)
#ifdef WITH_MPI
call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
mpirank_cols_qr,mpicomm_cols,mpierr)
#endif
! copy tau parts into temporary tau buffer
work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)
......@@ -257,9 +256,10 @@ module elpa_pdgeqrf
broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
! initiate broadcast (recv part)
#ifdef WITH_MPI
call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
mpirank_cols_qr,mpicomm_cols,mpierr)
#endif
! last n*n elements in buffer are (still empty) T matrix elements
! fetch from first process in each column
......@@ -530,10 +530,8 @@ module elpa_pdgeqrf
maxrank = min(PQRPARAM(1),n)
updatemode = PQRPARAM(2)
hgmode = PQRPARAM(4)
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
if (trans .eq. 1) then
incx = lda
else
......@@ -713,10 +711,8 @@ module elpa_pdgeqrf
#endif
return
end if
call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)
! calculate expected work size and store in work(1)
if (hgmode .eq. ichar('s')) then
! allreduce (MPI_SUM)
......@@ -770,11 +766,13 @@ module elpa_pdgeqrf
work(1) = alpha
work(2) = dot
#ifdef WITH_MPI
call mpi_allreduce(work(1),work(sendsize+1), &
sendsize,mpi_real8,mpi_sum, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
alpha = work(sendsize+1)
xnorm = sqrt(work(sendsize+2))
else if (hgmode .eq. ichar('x')) then
......@@ -790,11 +788,13 @@ module elpa_pdgeqrf
work(2*iproc+1) = alpha
work(2*iproc+2) = xnorm
end do
#ifdef WITH_MPI
call mpi_alltoall(work(1),2,mpi_real8, &
work(sendsize+1),2,mpi_real8, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+2-1) = work(1:2)
#endif
! extract alpha value
alpha = work(sendsize+1+mpirank_top*2)
......@@ -816,10 +816,13 @@ module elpa_pdgeqrf
work(2) = xnorm
! allgather
#ifdef WITH_MPI
call mpi_allgather(work(1),sendsize,mpi_real8, &
work(sendsize+1),sendsize,mpi_real8, &
mpi_comm,mpierr)
#else
work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
! extract alpha value
alpha = work(sendsize+1+mpirank_top*2)
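The #else branches added in this file all follow the same scheme: with a single process, a gather or reduction amounts to copying the send segment of the work buffer into the receive segment, so the unpacking code that follows stays identical in both builds. A hedged, self-contained illustration of that buffer convention (sizes and values are invented for the example):

program serial_collective_fallback
  implicit none
  integer, parameter :: sendsize = 2
  real(kind=8)       :: work(2*sendsize)

  work = 0.0d0
  work(1) = 0.5d0            ! e.g. local alpha
  work(2) = 9.0d0            ! e.g. local squared norm
#ifdef WITH_MPI
  ! an MPI build would gather contributions from all ranks, e.g.
  ! call mpi_allgather(work(1), sendsize, mpi_real8, &
  !                    work(sendsize+1), sendsize, mpi_real8, mpicomm, mpierr)
#else
  ! single process: the "gathered" result is the local contribution itself
  work(sendsize+1:2*sendsize) = work(1:sendsize)
#endif
  ! downstream code reads the receive segment in either build
  print *, 'alpha =', work(sendsize+1), '  xnorm =', sqrt(work(sendsize+2))
end program serial_collective_fallback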
......@@ -1040,10 +1043,8 @@ module elpa_pdgeqrf
#endif
return
end if
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
local_size1,baseoffset,local_offset1)
......@@ -1088,8 +1089,13 @@ module elpa_pdgeqrf
work(8) = 0.0d0 ! fill up buffer
! exchange partial results
#ifdef WITH_MPI
call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
mpicomm, mpierr)
#else
seed(1:8) = work(1:8)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("qr_pdlarfg2_1dcomm_seed")
#endif
......@@ -1188,10 +1194,8 @@ module elpa_pdgeqrf
call timer%start("qr_pdlarfg2_1dcomm_vector")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
local_size,baseoffset,local_offset)
......@@ -1269,10 +1273,10 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("qr_pdlarfg2_1dcomm_update")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
! seed should be updated by previous householder generation
! Update inner product of this column and next column vector
top11 = seed(1)
......@@ -1503,9 +1507,9 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("qr_pdlarfgk_1dcomm_seed")
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
C_size = k*k
D_size = k*k
sendoffset = 1
......@@ -1571,9 +1575,12 @@ module elpa_pdgeqrf
! TODO: store symmetric part more efficiently
! allreduce operation on results
#ifdef WITH_MPI
call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
mpi_real8,mpi_sum,mpicomm,mpierr)
#else
work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
#endif
! unpack result from buffer into seedC and seedD
seedC(1:k,1:k) = 0.0d0
do icol=1,k
......@@ -1918,7 +1925,6 @@ module elpa_pdgeqrf
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
lidx = baseidx-sidx+1
call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
local_size,baseoffset,local_offset)
......@@ -2024,7 +2030,6 @@ module elpa_pdgeqrf
endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
lidx = baseidx-sidx
if (lidx .lt. 1) then
#ifdef HAVE_DETAILED_TIMINGS
......@@ -2180,7 +2185,6 @@ module elpa_pdgeqrf
#endif
call mpi_comm_rank(mpicomm,mpirank,mpierr)
call mpi_comm_size(mpicomm,mpiprocs,mpierr)
call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, &
local_size,baseoffset,offset)
......@@ -2385,7 +2389,6 @@ module elpa_pdgeqrf
#endif
call MPI_Comm_rank(mpicomm, mpirank, mpierr)
call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
local_size,v_offset,x_offset)
v_offset = v_offset * incv
......
......@@ -40,11 +40,13 @@
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
module elpa_pdlarfb
use elpa1_compute
use qr_utils_mod
use elpa_mpi
implicit none
PRIVATE
......@@ -57,8 +59,6 @@ module elpa_pdlarfb
public :: qr_pdlarfl2_tmatrix_1dcomm
public :: qr_tmerge_pdlarfb_1dcomm