Commit 043ddf39 authored by Andreas Marek's avatar Andreas Marek

Experimental feature: 64bit integer support for MPI

ELPA can now be linked against 64-bit integer versions of MPI and
ScaLAPACK. This is an experimental feature; the integer-kind conversion
pattern applied throughout the code is sketched below.

The following points still need to be addressed:
- does not work with the real QR decomposition
- the generalized routines return wrong results
- the C tests and the C Cannon algorithm implementation do not
  work (no 64-bit MPI header files, at least with Intel MPI)
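For orientation, here is a minimal sketch of the conversion pattern this commit applies at every MPI call site: ELPA-internal integers stay in the default kind (ik, i.e. c_int), each argument is widened with int(..., kind=MPI_KIND) on the way into MPI, and the MPI_KIND results are narrowed back afterwards. The standalone program wrapper and the derivation of MPI_KIND from kind(MPI_COMM_WORLD) are assumptions made only for this illustration; in ELPA, MPI_KIND is fixed at configure time to match the integer width of the MPI library.

    program mpi_kind_sketch
      ! Illustrative sketch only: ELPA defines MPI_KIND at configure time,
      ! here it is derived from the kind of the MPI handle constants instead.
      use mpi
      use, intrinsic :: iso_c_binding, only : c_int
      implicit none
      integer, parameter     :: ik = c_int                     ! ELPA-internal integer kind
      integer, parameter     :: MPI_KIND = kind(MPI_COMM_WORLD)
      integer(kind=ik)       :: my_prow, np_rows, mpi_comm_rows
      integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, mpierr

      call mpi_init(mpierr)
      ! ELPA stores communicators in its own integer kind ...
      mpi_comm_rows = int(MPI_COMM_WORLD, kind=ik)

      ! ... widens every argument to MPI_KIND when calling MPI ...
      call mpi_comm_rank(int(mpi_comm_rows, kind=MPI_KIND), my_prowMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_rows, kind=MPI_KIND), np_rowsMPI, mpierr)

      ! ... and narrows the results back afterwards.
      my_prow = int(my_prowMPI, kind=ik)
      np_rows = int(np_rowsMPI, kind=ik)

      call mpi_finalize(mpierr)
    end program mpi_kind_sketch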
parent 09a8b7bb
......@@ -231,6 +231,10 @@ if test x"${enable_64bit_integer_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot both enable 64bit integer support and the legacy interface!])
fi
dnl at least Intel MPI does _NOT_ support 64-bit integer mode for C, thus disable the C tests in this case
if test x"${enable_c_tests}" = x"yes"; then
AC_MSG_ERROR([You cannot enable both 64bit integer support and the C tests. Reconfigure!])
fi
dnl check whether long int is the correct data-type in C
if test x"${size_of_long_int}" = x"8"; then
echo "Found C data-type \"long int\" with 8 bytes"
......
......@@ -248,6 +248,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
name = "validate_autotune{langsuffix}_{d}_{p}".format(langsuffix=language_flag[lang], d=d, p=p)
print("if ENABLE_AUTOTUNING")
if lang == "C":
print("if ENABLE_C_TESTS")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
if lang == "Fortran":
......@@ -267,6 +269,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
domain_flag[d],
prec_flag[p]]))
print("endif\n" * endifs)
if lang == "C":
print("endif")
print("endif")
name = "validate_multiple_objs_real_double"
......@@ -282,6 +286,7 @@ print(" " + " \\\n ".join([
print("endif")
name = "validate_multiple_objs_real_double_c_version"
print("if ENABLE_C_TESTS")
print("if ENABLE_AUTOTUNING")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
......@@ -292,6 +297,8 @@ print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['double']]))
print("endif")
print("endif")
name = "validate_split_comm_real_double"
print("check_SCRIPTS += " + name + "_extended.sh")
......
......@@ -103,7 +103,9 @@
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr
np_cols
integer(kind=MPI_KIND) :: mpierr
integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=BLAS_KIND) :: idxBLAS(NA)
......@@ -132,10 +134,16 @@
call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
! If my processor column isn't in the requested set, do nothing
......@@ -689,9 +697,9 @@
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
np_next, 1111, np_prev, 1111, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=MPI_KIND), MPI_REAL_PRECISION, &
int(np_next,kind=MPI_KIND), 1111_MPI_KIND, int(np_prev,kind=MPI_KIND), &
1111_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
......@@ -934,7 +942,7 @@
&PRECISION&
&(obj, idx_ev, nLength)
use precision
use elpa_abstract_impl
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nLength
......@@ -975,14 +983,16 @@
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, mod(i,4096), mpi_comm_cols, mpierr)
call mpi_send(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL_PRECISION, pc1, mod(i,4096), mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(qtmp(1,nc), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
......@@ -1040,9 +1050,9 @@
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, 1, &
tmp, l_rows, MPI_REAL_PRECISION, pc2, 1, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_sendrecv(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
......@@ -1052,9 +1062,9 @@
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL_PRECISION, pc1, 1, &
tmp, l_rows, MPI_REAL_PRECISION, pc1, 1, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_sendrecv(q(l_rqs,lc2), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
......@@ -1085,7 +1095,7 @@
! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1100,7 +1110,7 @@
if (npc_n==np_cols) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1115,8 +1125,9 @@
z(:) = z(:) + tmp(:)
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Sendrecv_replace(z, n, MPI_REAL_PRECISION, np_next, 1111, np_prev, 1111, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call MPI_Sendrecv_replace(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next,kind=MPI_KIND), 1111_MPI_KIND, &
int(np_prev,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
enddo
......@@ -1143,7 +1154,7 @@
! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_rows, mpierr)
call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1158,7 +1169,7 @@
if (npc_n==np_cols) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
z = tmp
......@@ -1174,7 +1185,8 @@
do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(tmp, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:n) = z(1:n)
......@@ -1184,15 +1196,18 @@
do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(z, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, mpierr)
call mpi_send(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
enddo
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(tmp, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, mpierr)
call mpi_recv(z ,n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_send(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call mpi_recv(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
z(1:n) = tmp(1:n)
......
......@@ -75,8 +75,8 @@ subroutine solve_tridi_&
logical, intent(out) :: success
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
integer(kind=ik) :: istat
......@@ -93,10 +93,16 @@ subroutine solve_tridi_&
call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
success = .true.
......@@ -293,7 +299,8 @@ subroutine solve_tridi_&
call obj%timer%start("mpi_communication")
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
enddo
endif
call obj%timer%stop("mpi_communication")
......@@ -302,7 +309,8 @@ subroutine solve_tridi_&
if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(d(noff+1), nmid, MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_off,kind=MPI_KIND), 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
......@@ -313,7 +321,8 @@ subroutine solve_tridi_&
do n=np_off,np_off+np1-1
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), &
1_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -322,7 +331,8 @@ subroutine solve_tridi_&
if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_off+np1,kind=MPI_KIND), &
1_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
......@@ -335,7 +345,8 @@ subroutine solve_tridi_&
call merge_systems_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
nblk, matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success, max_threads )
if (.not.(success)) return
else
......@@ -343,7 +354,8 @@ subroutine solve_tridi_&
call merge_systems_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
nblk, matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success, max_threads )
if (.not.(success)) return
endif
......@@ -378,7 +390,8 @@ subroutine solve_tridi_&
real(kind=REAL_DATATYPE), allocatable :: qmat1(:,:), qmat2(:,:)
integer(kind=ik) :: i, n, np
integer(kind=ik) :: ndiv, noff, nmid, nlen, max_size
integer(kind=ik) :: my_prow, np_rows, mpierr
integer(kind=ik) :: my_prow, np_rows
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, np_rowsMPI
integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
logical, intent(in) :: useGPU, wantDebug
......@@ -390,8 +403,11 @@ subroutine solve_tridi_&
call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
success = .true.
! Calculate the number of subdivisions needed.
......@@ -497,9 +513,11 @@ subroutine solve_tridi_&
nlen = limits(np+1)-noff
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(d(noff+1), nlen, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
call MPI_Bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
qmat2 = qmat1
call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
call MPI_Bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! qmat2 = qmat1 ! is this correct
......@@ -559,7 +577,8 @@ subroutine solve_tridi_&
call merge_systems_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success, max_threads)
if (.not.(success)) return
......
......@@ -93,6 +93,7 @@ function elpa_solve_evp_&
complex(kind=C_DATATYPE_KIND), allocatable,target :: q_dummy(:,:)
complex(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, np_rows, np_cols
integer(kind=MPI_KIND) :: np_rowsMPI, np_colsMPI
#endif /* COMPLEXCASE */
logical :: useGPU
......@@ -102,7 +103,8 @@ function elpa_solve_evp_&
do_useGPU_solve_tridi, do_useGPU_trans_ev
integer(kind=ik) :: numberOfGPUDevices
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol
integer(kind=MPI_KIND) :: mpierr, my_peMPI, n_pesMPI, my_prowMPI, my_pcolMPI
real(kind=C_DATATYPE_KIND), allocatable :: e(:)
logical :: wantDebug
integer(kind=c_int) :: istat, debug, gpu
......@@ -204,12 +206,18 @@ function elpa_solve_evp_&
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
#if COMPLEXCASE == 1
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
np_rows = int(np_rowsMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
#endif
call obj%timer%stop("mpi_communication")
......@@ -230,7 +238,9 @@ function elpa_solve_evp_&
print *,"Problem getting option. Aborting..."
stop
endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
my_pe = int(my_peMPI,kind=c_int)
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
......
......@@ -286,7 +286,7 @@
if (wantDebug) call obj%timer%start("hh_transform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX )
&PRECISION_SUFFIX )
#if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=rk )
......
......@@ -115,7 +115,8 @@
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
......@@ -150,10 +151,15 @@
gpuString)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI, kind=c_int)
np_rows = int(np_rowsMPI, kind=c_int)
my_pcol = int(my_pcolMPI, kind=c_int)
np_cols = int(np_colsMPI, kind=c_int)
call obj%timer%stop("mpi_communication")
call obj%get("max_stored_rows",max_stored_rows_fac, error)
......@@ -271,7 +277,8 @@
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (nb>0) &
call MPI_Bcast(hvb, nb, MPI_MATH_DATATYPE_PRECISION , cur_pcol, mpi_comm_cols, mpierr)
call MPI_Bcast(hvb, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION , int(cur_pcol,kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -311,7 +318,8 @@
enddo
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
if (nc>0) call mpi_allreduce( h1, h2, int(nc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
......@@ -396,7 +404,8 @@
check_memcpy_cuda("trans_ev", successCUDA)
endif
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
......
......@@ -124,7 +124,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=ik) :: mpierr
integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=MPI_KIND) :: mpierr
integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
max_local_cols
! updated after each istep (in the main cycle) to contain number of
......@@ -194,10 +195,15 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI, kind=c_int)
np_rows = int(np_rowsMPI, kind=c_int)
my_pcol = int(my_pcolMPI, kind=c_int)
np_cols = int(np_colsMPI, kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
......@@ -409,8 +415,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1
......@@ -460,8 +466,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
! Broadcast the Householder Vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
call MPI_Bcast(v_row, int(l_rows+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(istep, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -697,8 +703,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
u_col = tmp
......@@ -719,7 +725,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
......@@ -885,7 +891,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(prow(1, nblk, np_rows),kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -893,7 +900,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(pcol(2, nblk, np_cols),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -976,13 +984,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......
......@@ -60,7 +60,8 @@
#else
MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols)
#endif
integer(kind=ik) :: my_prow, my_pcol,