elpa / elpa · Commits

Commit d9011373, authored Feb 26, 2016 by Andreas Marek

Merge branch 'master' into ELPA_GPU

Parents: bb63bd9e, 9af253bc
Changes: 41
Makefile.am

@@ -10,6 +10,8 @@ lib_LTLIBRARIES = libelpa@SUFFIX@.la

libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++

libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
                             src/mod_mpi.F90 \
                             src/mod_mpi_stubs.F90 \
                             src/elpa_utilities.F90 \
                             src/elpa1_compute.F90 \
                             src/elpa1.F90 \

@@ -26,9 +28,9 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
                             src/elpa2_compute.F90 \
                             src/elpa2.F90 \
                             src/elpa_c_interface.F90 \
                             src/elpa_qr/qr_utils.f90 \
                             src/elpa_qr/qr_utils.F90 \
                             src/elpa_qr/elpa_qrkernels.f90 \
                             src/elpa_qr/elpa_pdlarfb.f90 \
                             src/elpa_qr/elpa_pdlarfb.F90 \
                             src/elpa_qr/elpa_pdgeqrf.F90

if HAVE_DETAILED_TIMINGS
  libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \

@@ -47,6 +49,13 @@ if WITH_GPU_VERSION
  #src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif

if !WITH_MPI
  libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90
if !HAVE_DETAILED_TIMINGS
  libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c
endif
endif

if WITH_REAL_GENERIC_KERNEL
  libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90
endif

@@ -370,6 +379,7 @@ distclean-local:

EXTRA_DIST = \
  fdep/fortran_dependencies.pl \
  fdep/fortran_dependencies.mk \
  test/fortran_test_programs/elpa_test_programs_print_headers.X90 \
  src/elpa_reduce_add_vectors.X90 \
  src/elpa_transpose_vectors.X90 \
  src/redist_band.X90 \
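The two new sources src/mod_mpi.F90 and src/mod_mpi_stubs.F90, together with the `if !WITH_MPI` block above, allow the same library to be built with or without MPI: in a shared-memory-only build the MPI calls must be satisfied by serial stand-ins. The contents of those modules are not part of this diff; the following is only a hypothetical sketch of what such a stub module could provide (module name, routine bodies and the test program are assumptions, not ELPA code):

    ! Hypothetical sketch of an MPI stub module for a serial (shared-memory-only) build.
    ! Not the actual src/mod_mpi_stubs.F90 from ELPA.
    module mpi_stubs_sketch
      implicit none
      integer, parameter :: MPI_COMM_WORLD = 0   ! dummy communicator handle
    contains
      subroutine mpi_comm_rank(comm, rank, ierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: rank, ierr
        rank = 0            ! a serial run is always "rank 0"
        ierr = 0
      end subroutine
      subroutine mpi_comm_size(comm, nprocs, ierr)
        integer, intent(in)  :: comm
        integer, intent(out) :: nprocs, ierr
        nprocs = 1          ! exactly one "process"
        ierr = 0
      end subroutine
    end module mpi_stubs_sketch

    program test_stubs
      use mpi_stubs_sketch
      implicit none
      integer :: me, np, ierr
      call mpi_comm_rank(MPI_COMM_WORLD, me, ierr)
      call mpi_comm_size(MPI_COMM_WORLD, np, ierr)
      print *, 'rank', me, 'of', np
    end program test_stubs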
configure.ac

@@ -68,37 +68,48 @@ if test x"${enable_openmp}" = x"yes"; then
  AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi

AC_MSG_CHECKING(whether --enable-shared-memory-only is specified)
AC_ARG_ENABLE([shared-memory-only],
              AS_HELP_STRING([--enable-shared-memory-only],
                             [do not use MPI; ELPA will be build for one node shared-memory runs only]),
              [],
              [enable_shared_memory_only=no])
AC_MSG_RESULT([${enable_shared_memory_only}])
AM_CONDITIONAL([WITH_MPI],[test x"$enable_shared_memory_only" = x"no"])
if test x"${enable_shared_memory_only}" = x"no"; then
  AC_DEFINE([WITH_MPI], [1], [use MPI])
fi

dnl check whether mpi compilers are available;
dnl if not abort since it is mandatory

# C
AC_LANG([C])
m4_include([m4/ax_prog_cc_mpi.m4])
AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])])
AX_PROG_CC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_cc_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
  fi
  CFLAGS="$OPENMP_CFLAGS $CFLAGS"
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_cc_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
  fi
  CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi

AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS

# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])])
AX_PROG_FC_MPI([test x"$enable_shared_memory_only" = xno],[use_mpi=yes],[use_mpi=no])
if test x"${enable_openmp}" = x"yes"; then
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_fc_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
  fi
  FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_fc_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
  fi
  FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi

# C++
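With this change AX_PROG_CC_MPI and AX_PROG_FC_MPI only demand an MPI compiler wrapper when --enable-shared-memory-only is not given (use_mpi=yes/no is recorded instead of aborting). What such a wrapper must be able to build is essentially a trivial MPI program; a minimal Fortran illustration of that kind (illustrative only, not the conftest source generated by the macros) is:

    ! Minimal MPI Fortran program of the sort an MPI compiler wrapper must be able to build.
    ! Build with e.g.:  mpif90 mpi_hello.f90
    program mpi_hello
      use mpi              ! assumes an MPI library providing the 'mpi' module
      implicit none
      integer :: ierr, me, nprocs
      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
      if (me == 0) print *, 'running on', nprocs, 'MPI tasks'
      call MPI_Finalize(ierr)
    end program mpi_hello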
@@ -106,11 +117,11 @@ AC_LANG([C++])
AC_PROG_CXX

if test x"${enable_openmp}" = x"yes"; then
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_cxx_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
  fi
  CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
  AX_ELPA_OPENMP
  if test "$ac_cv_prog_cxx_openmp" = unsupported; then
    AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
  fi
  CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
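AX_ELPA_OPENMP verifies that each compiler (C, Fortran, C++) accepts its OpenMP flag before that flag is prepended to CFLAGS/FCFLAGS/CXXFLAGS. As a rough illustration of what the Fortran variant of such a probe has to be able to compile (a sketch, not the macro's own test source):

    ! Minimal OpenMP Fortran program of the kind an OpenMP probe must compile.
    ! Build with e.g.:  gfortran -fopenmp omp_probe.f90
    program omp_probe
      use omp_lib
      implicit none
      integer :: nthreads
    !$omp parallel
    !$omp single
      nthreads = omp_get_num_threads()   ! number of threads in the parallel region
    !$omp end single
    !$omp end parallel
      print *, 'OpenMP reports', nthreads, 'threads'
    end program omp_probe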
@@ -386,35 +397,37 @@ else
  AC_MSG_ERROR([could not link with lapack: specify path])
fi

dnl test whether scalapack already contains blacs
scalapack_libs="mpiscalapack scalapack"
old_LIBS="$LIBS"
for lib in ${scalapack_libs}; do
  LIBS="-l${lib} ${old_LIBS}"
  AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
  AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
  AC_MSG_RESULT([${blacs_in_scalapack}])
  if test x"${blacs_in_scalapack}" = x"yes"; then
    break
  fi
done

if test x"${enable_shared_memory_only}" = x"no"; then
  dnl test whether scalapack already contains blacs
  scalapack_libs="mpiscalapack scalapack"
  old_LIBS="$LIBS"
  for lib in ${scalapack_libs}; do
    LIBS="-l${lib} ${old_LIBS}"
    AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation])
    AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no])
    AC_MSG_RESULT([${blacs_in_scalapack}])
    if test x"${blacs_in_scalapack}" = x"yes"; then
      break
    fi
  done

if test x"${blacs_in_scalapack}" = x"no"; then
  LIBS="${old_LIBS}"
  if test x"${blacs_in_scalapack}" = x"no"; then
    LIBS="${old_LIBS}"

dnl Test for stand-alone blacs
AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])
    dnl Test for stand-alone blacs
    AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs])
    AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no])

if test x"${have_blacs}" = x"no"; then
  AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
    if test x"${have_blacs}" = x"no"; then
      AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure])
    fi
  fi
fi

AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])
  AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no])

if test x"${have_scalapack}" = x"no" ; then
  AC_MSG_ERROR([could not link with scalapack: specify path])
  if test x"${have_scalapack}" = x"no" ; then
    AC_MSG_ERROR([could not link with scalapack: specify path])
  fi
fi

dnl check whether we can link alltogehter

@@ -718,7 +731,7 @@ if test x"${use_specific_complex_kernel}" = x"no" ; then
fi

if test x"${use_specific_real_kernel}" = x"no" ; then
  AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
  AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
fi

LT_INIT
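The BLACS/ScaLAPACK checks are now skipped entirely for --enable-shared-memory-only builds; otherwise AC_SEARCH_LIBS tries to link blacs_gridinit and pdtran. A small Fortran program that exercises the BLACS entry points those checks look for (an illustration assuming an installed MPI and ScaLAPACK/BLACS, not part of the ELPA test suite) might be:

    ! Sketch: initialize and release a BLACS process grid.
    ! Build with e.g.:  mpif90 blacs_check.f90 -lscalapack
    program blacs_check
      implicit none
      integer :: iam, nprocs, ictxt
      call blacs_pinfo(iam, nprocs)               ! who am I, how many processes
      call blacs_get(-1, 0, ictxt)                ! obtain the default system context
      call blacs_gridinit(ictxt, 'R', 1, nprocs)  ! row-major 1 x nprocs grid
      call blacs_gridexit(ictxt)
      call blacs_exit(0)
    end program blacs_check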
src/check_for_gpu.F90

@@ -48,8 +48,9 @@ module mod_check_for_gpu
  function check_for_gpu(myid, numberOfDevices, wantDebug) result(gpuAvailable)

    use cuda_functions
    use precision
    use elpa_mpi
    implicit none
    include 'mpif.h'

    integer(kind=ik), intent(in)  :: myid
    logical, optional, intent(in) :: wantDebug
    logical                       :: success, wantDebugMessage

@@ -80,6 +81,7 @@ module mod_check_for_gpu
    ! make sure that all nodes have the same number of GPU's, otherwise
    ! we run into loadbalancing trouble
#ifdef WITH_MPI
    call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)

    if (maxNumberOfDevices .ne. numberOfDevices) then

@@ -88,7 +90,7 @@ module mod_check_for_gpu
      gpuAvailable = .false.
      return
    endif
#endif
    if (numberOfDevices .ne. 0) then
      gpuAvailable = .true.
      ! Usage of GPU is possible since devices have been detected
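The device-count consistency check in check_for_gpu is now only compiled when WITH_MPI is defined. Its core is an MPI_MAX reduction of the locally detected GPU count followed by a comparison; a standalone sketch of that pattern (the device count is faked here instead of coming from the CUDA runtime) is:

    ! Standalone sketch of the GPU-count consistency check used in check_for_gpu:
    ! every rank contributes its local device count, the maximum is compared
    ! against the local value, and mismatching ranks report a problem.
    program gpu_count_check
      use mpi
      implicit none
      integer :: numberOfDevices, maxNumberOfDevices, myid, mpierr
      logical :: gpuAvailable

      call MPI_Init(mpierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, myid, mpierr)

      numberOfDevices = 1        ! stand-in for the value reported by the CUDA runtime
      gpuAvailable    = .false.

      call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, &
                         MPI_MAX, MPI_COMM_WORLD, mpierr)
      if (maxNumberOfDevices .ne. numberOfDevices) then
        if (myid == 0) print *, 'load balancing problem: unequal GPU counts'
      else if (numberOfDevices .ne. 0) then
        gpuAvailable = .true.    ! usage of GPU is possible, devices were detected
      end if

      call MPI_Finalize(mpierr)
    end program gpu_count_check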
src/elpa1.F90

@@ -86,9 +86,11 @@ module ELPA1
  use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
  use timings
  use timings
#endif
  use iso_c_binding
  use elpa_mpi
  implicit none

  PRIVATE ! By default, all routines contained are private

@@ -111,7 +113,6 @@ module ELPA1
  logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs

  include 'mpif.h'

!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators"
!> \detail

@@ -330,6 +331,7 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
  ttt0 = MPI_Wtime()
  call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
  ttt1 = MPI_Wtime()
  if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :', ttt1-ttt0
  time_evp_fwd = ttt1-ttt0
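The timing around tridiag_real follows the usual ELPA pattern: MPI_Wtime() before and after the call, a message on the (0,0) process when elpa_print_times is set, and accumulation into time_evp_fwd. A reduced, self-contained version of that pattern, timing a placeholder loop instead of tridiag_real and checking only one rank coordinate, could look like:

    ! Sketch of the MPI_Wtime-based timing pattern used around tridiag_real.
    ! The timed "work" here is a placeholder loop, not the ELPA routine.
    program timing_pattern
      use mpi
      use, intrinsic :: iso_fortran_env, only : error_unit
      implicit none
      logical          :: elpa_print_times = .true.
      integer          :: my_prow, mpierr, i
      double precision :: ttt0, ttt1, time_evp_fwd, s

      call MPI_Init(mpierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, my_prow, mpierr)

      ttt0 = MPI_Wtime()
      s = 0.0d0
      do i = 1, 1000000          ! placeholder for: call tridiag_real(...)
        s = s + dble(i)
      end do
      ttt1 = MPI_Wtime()

      if (my_prow == 0 .and. elpa_print_times) &
        write(error_unit,*) 'Time dummy work :', ttt1 - ttt0, s
      time_evp_fwd = ttt1 - ttt0   ! mirrors the accumulation into time_evp_fwd

      call MPI_Finalize(mpierr)
    end program timing_pattern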
src/elpa1_compute.F90 (diff collapsed, not shown)
src/elpa2.F90

@@ -80,6 +80,8 @@ module ELPA2
! use cuda_c_kernel
! use iso_c_binding
#endif
  use elpa_mpi
  implicit none

  PRIVATE ! By default, all routines contained are private

@@ -88,7 +90,6 @@ module ELPA2
  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  include 'mpif.h'

!******
contains

@@ -177,6 +178,7 @@ contains
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage")
#endif
    call mpi_comm_rank(mpi_comm_all, my_pe, mpierr)
    call mpi_comm_size(mpi_comm_all, n_pes, mpierr)

@@ -295,7 +297,7 @@ contains
    ttts = ttt0
    call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, wantDebug, useGPU, success, useQRActual)
    print *, "hier 1:", q(10,10)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &

@@ -312,10 +314,14 @@ contains
    ttt0 = MPI_Wtime()
    call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                           mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
    print *, "hier 2:", q(10,10)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real :', ttt1-ttt0

#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
    call mpi_bcast(ev, na, MPI_REAL8, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, MPI_REAL8, 0, mpi_comm_all, mpierr)

@@ -324,6 +330,7 @@ contains
    call mpi_bcast(e, na, MPI_REAL4, 0, mpi_comm_all, mpierr)
#endif
#endif /* WITH_MPI */
    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

@@ -332,6 +339,8 @@ contains
    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
                     mpi_comm_cols, wantDebug, success)
    print *, "hier 3:", q(10,10)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()

@@ -348,9 +357,12 @@ contains
    ! Backtransform stage 1
    ttt0 = MPI_Wtime()
    print *, "hier 4a:", q(10,10)
    call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                     mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                     THIS_REAL_ELPA_KERNEL)
    print *, "hier 4:", q(10,10)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &

@@ -369,6 +381,8 @@ contains
    ttt0 = MPI_Wtime()
    call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                    mpi_comm_cols, useGPU, useQRActual)
    print *, "hier 5:", q(10,10)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :', ttt1-ttt0

@@ -461,6 +475,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage")
#endif
    call mpi_comm_rank(mpi_comm_all, my_pe, mpierr)
    call mpi_comm_size(mpi_comm_all, n_pes, mpierr)

@@ -545,6 +560,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
    call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
    if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop()
#endif

@@ -566,10 +582,13 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
    ttt0 = MPI_Wtime()
    call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                              mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex :', ttt1-ttt0

#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
    call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr)

@@ -577,6 +596,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
    call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr)
#endif
#endif /* WITH_MPI */
    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

@@ -631,8 +652,6 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
      stop
    endif

    ! Backtransform stage 2
    ttt0 = MPI_Wtime()
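In both 2-stage solvers the broadcast of ev and e is now guarded by WITH_MPI and selects MPI_REAL8 or MPI_REAL4 depending on the DOUBLE_PRECISION_* symbol. The guard structure, reduced to a minimal preprocessed file (the kind parameters and the use of MPI_COMM_WORLD instead of mpi_comm_all are assumptions of the sketch), is:

    ! bcast_sketch.F90 -- minimal illustration of the WITH_MPI / precision guards
    ! around the mpi_bcast of ev and e in solve_evp_*_2stage. Build with e.g.:
    !   mpif90 -cpp -DWITH_MPI -DDOUBLE_PRECISION_REAL bcast_sketch.F90
    program bcast_sketch
    #ifdef WITH_MPI
      use mpi
    #endif
      implicit none
      integer, parameter :: na = 4
    #ifdef DOUBLE_PRECISION_REAL
      integer, parameter :: rk = kind(1.0d0)
    #else
      integer, parameter :: rk = kind(1.0)
    #endif
      real(kind=rk) :: ev(na), e(na)
      integer       :: mpierr

      ev = 0.0_rk
      e  = 0.0_rk
    #ifdef WITH_MPI
      call MPI_Init(mpierr)
    #ifdef DOUBLE_PRECISION_REAL
      call mpi_bcast(ev, na, MPI_REAL8, 0, MPI_COMM_WORLD, mpierr)
      call mpi_bcast(e,  na, MPI_REAL8, 0, MPI_COMM_WORLD, mpierr)
    #else
      call mpi_bcast(ev, na, MPI_REAL4, 0, MPI_COMM_WORLD, mpierr)
      call mpi_bcast(e,  na, MPI_REAL4, 0, MPI_COMM_WORLD, mpierr)
    #endif
      call MPI_Finalize(mpierr)
    #endif
    end program bcast_sketch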
src/elpa2_compute.F90 (diff collapsed, not shown)
src/elpa2_kernels/elpa2_kernels_real_bgp.f90

@@ -110,9 +110,9 @@
  subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s)

    use precision
    use elpa_mpi
    implicit none
    include 'mpif.h'

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
    complex(kind=ck), intent(inout) :: q(ldq/2,*)

@@ -387,9 +387,9 @@
  subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s)

    use precision
    use elpa_mpi
    implicit none
    include 'mpif.h'

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
    complex(kind=ck), intent(inout) :: q(ldq/2,*)

@@ -629,9 +629,9 @@
  subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s)

    use precision
    use elpa_mpi
    implicit none
    include 'mpif.h'

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
    complex(kind=ck), intent(inout) :: q(ldq/2,*)
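As in the other files, `use elpa_mpi` is placed next to (or instead of) `include 'mpif.h'`, so that a single module decides whether the real MPI interface or the serial stubs are seen by the kernels. A hypothetical wrapper in that spirit (the real src/mod_mpi.F90 is not shown in this diff; the stub module referenced here is the sketch after the Makefile.am section) could be as simple as:

    ! Hypothetical wrapper module in the spirit of 'use elpa_mpi': it re-exports
    ! either the real MPI module or serial stand-ins, selected at compile time.
    ! This is an illustration, not the actual ELPA source.
    module elpa_mpi_sketch
    #ifdef WITH_MPI
      use mpi                   ! real MPI library
    #else
      use mpi_stubs_sketch      ! serial stand-ins (see the sketch after Makefile.am)
    #endif
      implicit none
      public
    end module elpa_mpi_sketch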
src/elpa_qr/elpa_pdgeqrf.F90

@@ -48,7 +48,7 @@ module elpa_pdgeqrf
  use elpa1_compute
  use elpa_pdlarfb
  use qr_utils_mod
  use elpa_mpi
  implicit none

  PRIVATE

@@ -57,7 +57,6 @@ module elpa_pdgeqrf
  public :: qr_pqrparam_init
  public :: qr_pdlarfg2_1dcomm_check

  include 'mpif.h'

contains

@@ -120,7 +119,6 @@ module elpa_pdgeqrf
    ! copy value before we are going to filter it
    total_cols = n

    call mpi_comm_rank(mpicomm_cols, mpirank_cols, mpierr)
    call mpi_comm_rank(mpicomm_rows, mpirank_rows, mpierr)
    call mpi_comm_size(mpicomm_cols, mpiprocs_cols, mpierr)

@@ -235,9 +233,10 @@ module elpa_pdgeqrf
    !end if

    ! initiate broadcast (send part)
#ifdef WITH_MPI
    call MPI_Bcast(work(broadcast_offset), broadcast_size, mpi_real8, &
                   mpirank_cols_qr, mpicomm_cols, mpierr)
#endif
    ! copy tau parts into temporary tau buffer
    work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)

@@ -257,9 +256,10 @@ module elpa_pdgeqrf
    broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)

    ! initiate broadcast (recv part)
#ifdef WITH_MPI
    call MPI_Bcast(work(broadcast_offset), broadcast_size, mpi_real8, &
                   mpirank_cols_qr, mpicomm_cols, mpierr)
#endif
    ! last n*n elements in buffer are (still empty) T matrix elements
    ! fetch from first process in each column

@@ -530,10 +530,8 @@ module elpa_pdgeqrf
    maxrank    = min(PQRPARAM(1), n)
    updatemode = PQRPARAM(2)
    hgmode     = PQRPARAM(4)

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    if (trans .eq. 1) then
      incx = lda
    else

@@ -713,10 +711,8 @@ module elpa_pdgeqrf
#endif
      return
    end if

    call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
    call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)

    ! calculate expected work size and store in work(1)
    if (hgmode .eq. ichar('s')) then
      ! allreduce (MPI_SUM)

@@ -770,11 +766,13 @@ module elpa_pdgeqrf
      work(1) = alpha
      work(2) = dot

#ifdef WITH_MPI
      call mpi_allreduce(work(1), work(sendsize+1), &
                         sendsize, mpi_real8, mpi_sum, &
                         mpi_comm, mpierr)
#else
      work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
      alpha = work(sendsize+1)
      xnorm = sqrt(work(sendsize+2))
    else if (hgmode .eq. ichar('x')) then

@@ -790,11 +788,13 @@ module elpa_pdgeqrf
        work(2*iproc+1) = alpha
        work(2*iproc+2) = xnorm
      end do

#ifdef WITH_MPI
      call mpi_alltoall(work(1), 2, mpi_real8, &
                        work(sendsize+1), 2, mpi_real8, &
                        mpi_comm, mpierr)
#else
      work(sendsize+1:sendsize+1+2-1) = work(1:2)
#endif
      ! extract alpha value
      alpha = work(sendsize+1+mpirank_top*2)

@@ -816,10 +816,13 @@ module elpa_pdgeqrf
      work(2) = xnorm

      ! allgather
#ifdef WITH_MPI
      call mpi_allgather(work(1), sendsize, mpi_real8, &
                         work(sendsize+1), sendsize, mpi_real8, &
                         mpi_comm, mpierr)
#else
      work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
      ! extract alpha value
      alpha = work(sendsize+1+mpirank_top*2)

@@ -1040,10 +1043,8 @@ module elpa_pdgeqrf
#endif
      return
    end if

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(n, nb, idx, idx-1, rev, mpirank, mpiprocs, &
                              local_size1, baseoffset, local_offset1)

@@ -1088,8 +1089,13 @@ module elpa_pdgeqrf
    work(8) = 0.0d0 ! fill up buffer

    ! exchange partial results
#ifdef WITH_MPI
    call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
                       mpicomm, mpierr)
#else
    seed(1:8) = work(1:8)
#endif
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("qr_pdlarfg2_1dcomm_seed")
#endif

@@ -1188,10 +1194,8 @@ module elpa_pdgeqrf
    call timer%start("qr_pdlarfg2_1dcomm_vector")
#endif

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(n, nb, idx, idx-1, rev, mpirank, mpiprocs, &
                              local_size, baseoffset, local_offset)

@@ -1269,10 +1273,10 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("qr_pdlarfg2_1dcomm_update")
#endif

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    ! seed should be updated by previous householder generation
    ! Update inner product of this column and next column vector
    top11 = seed(1)

@@ -1503,9 +1507,9 @@ module elpa_pdgeqrf
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("qr_pdlarfgk_1dcomm_seed")
#endif

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    C_size = k*k
    D_size = k*k
    sendoffset = 1

@@ -1571,9 +1575,12 @@ module elpa_pdgeqrf
    ! TODO: store symmetric part more efficiently

    ! allreduce operation on results
#ifdef WITH_MPI
    call mpi_allreduce(work(sendoffset), work(recvoffset), sendrecv_size, &
                       mpi_real8, mpi_sum, mpicomm, mpierr)
#else
    work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
#endif
    ! unpack result from buffer into seedC and seedD
    seedC(1:k,1:k) = 0.0d0
    do icol = 1, k

@@ -1918,7 +1925,6 @@ module elpa_pdgeqrf
#endif
    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    lidx = baseidx - sidx + 1
    call local_size_offset_1d(n, nb, baseidx, lidx-1, rev, mpirank, mpiprocs, &
                              local_size, baseoffset, local_offset)

@@ -2024,7 +2030,6 @@ module elpa_pdgeqrf
    endif

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    lidx = baseidx - sidx
    if (lidx .lt. 1) then
#ifdef HAVE_DETAILED_TIMINGS

@@ -2180,7 +2185,6 @@ module elpa_pdgeqrf
#endif
    call mpi_comm_rank(mpicomm, mpirank, mpierr)
    call mpi_comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(m, mb, baseidx, rowidx, rev, mpirank, mpiprocs, &
                              local_size, baseoffset, offset)

@@ -2385,7 +2389,6 @@ module elpa_pdgeqrf
#endif
    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(n, nb, baseidx, idx, rev, mpirank, mpiprocs, &
                              local_size, v_offset, x_offset)
    v_offset = v_offset * incv
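The recurring edit in elpa_pdgeqrf.F90 wraps every collective (MPI_Bcast, mpi_allreduce, mpi_alltoall, mpi_allgather) in #ifdef WITH_MPI and, where a result buffer is expected, falls back to a plain array copy such as work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize). A stripped-down, compilable version of that pattern (the work layout and values are invented for the example) is:

    ! reduce_fallback.F90 -- sketch of the "#ifdef WITH_MPI allreduce, #else copy"
    ! pattern used throughout elpa_pdgeqrf.F90. The work layout (sendsize local
    ! values followed by sendsize result slots) is an invented example.
    ! Build:  mpif90 -cpp -DWITH_MPI reduce_fallback.F90   or   gfortran -cpp reduce_fallback.F90
    program reduce_fallback
    #ifdef WITH_MPI
      use mpi
    #endif
      implicit none
      integer, parameter :: sendsize = 2
      double precision   :: work(2*sendsize)
      integer            :: mpierr

      work(1:sendsize) = (/ 1.0d0, 2.0d0 /)      ! local contributions
    #ifdef WITH_MPI
      call MPI_Init(mpierr)
      call mpi_allreduce(work(1), work(sendsize+1), sendsize, mpi_real8, mpi_sum, &
                         MPI_COMM_WORLD, mpierr)
      call MPI_Finalize(mpierr)
    #else
      ! serial build: the "global" result is just the local contribution
      work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
    #endif
      print *, 'reduced values:', work(sendsize+1:2*sendsize)
    end program reduce_fallback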
src/elpa_qr/elpa_pdlarfb.f90 → src/elpa_qr/elpa_pdlarfb.F90 (renamed; an uppercase .F90 suffix tells the Fortran compiler to run the preprocessor over the file)