elpa_reduce_add_vectors.F90 7.74 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 16 17 18 19 20
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
21
!    http://elpa.mpcdf.mpg.de/
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
43
! Author: Andreas Marek, MPCDF
44 45
#endif

46
#include "config-f90.h"
47
#include "../general/sanity.F90"
48
#include "../general/error_checking.inc"
49

Andreas Marek's avatar
Cleanup  
Andreas Marek committed
50 51 52 53
subroutine elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
            (obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvr, nvc, nblk, nrThreads)
55

56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
!-------------------------------------------------------------------------------
! This routine does a reduce of all vectors in vmat_s over the communicator comm_t.
! The result of the reduce is gathered on the processors owning the diagonal
! and added to the array of vectors vmat_t (which is distributed over comm_t).
!
! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s
! in the different members within vmat_t (else a reduce wouldn't be necessary).
! After this routine, an allreduce of vmat_t has to be done.
!
! vmat_s    array of vectors to be reduced and added
! ld_s      leading dimension of vmat_s
! comm_s    communicator over which vmat_s is distributed
! vmat_t    array of vectors to which vmat_s is added
! ld_t      leading dimension of vmat_t
! comm_t    communicator over which vmat_t is distributed
! nvr       global length of vmat_s/vmat_t
! nvc       number of columns in vmat_s/vmat_t
! nblk      block size of block cyclic distribution
!
!-------------------------------------------------------------------------------

Andreas Marek's avatar
Andreas Marek committed
77
   use precision
Andreas Marek's avatar
Andreas Marek committed
78 79 80
#ifdef WITH_OPENMP
   use omp_lib
#endif
81
   use elpa_mpi
82
   use elpa_abstract_impl
83 84
   implicit none

Andreas Marek's avatar
Andreas Marek committed
85
   class(elpa_abstract_impl_t), intent(inout)         :: obj
86 87 88
   integer(kind=ik), intent(in)                       :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in)    :: vmat_s(ld_s,nvc)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: vmat_t(ld_t,nvc)
89

90 91
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable   :: aux1(:), aux2(:)
   integer(kind=ik)                                   :: myps, mypt, nps, npt
92 93 94
   integer(kind=MPI_KIND)                             :: mypsMPI, npsMPI, myptMPI, nptMPI
   integer(kind=ik)                                   :: n, lc, k, i, ips, ipt, ns, nl
   integer(kind=MPI_KIND)                             :: mpierr
95
   integer(kind=ik)                                   :: lcm_s_t, nblks_tot
Andreas Marek's avatar
Andreas Marek committed
96
   integer(kind=ik)                                   :: auxstride
Andreas Marek's avatar
Andreas Marek committed
97
   integer(kind=ik), intent(in)                       :: nrThreads
98 99
   integer(kind=ik)                                   :: istat
   character(200)                                     :: errorMessage
100

101
   call obj%timer%start("elpa_reduce_add_vectors_&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
102 103 104 105
   &MATH_DATATYPE&
   &" // &
   &PRECISION_SUFFIX &
   )
106

107
   call obj%timer%start("mpi_communication")
108 109 110 111 112 113 114 115 116
   call mpi_comm_rank(int(comm_s,kind=MPI_KIND), mypsMPI, mpierr)
   call mpi_comm_size(int(comm_s,kind=MPI_KIND), npsMPI,  mpierr)
   call mpi_comm_rank(int(comm_t,kind=MPI_KIND), myptMPI, mpierr)
   call mpi_comm_size(int(comm_t,kind=MPI_KIND), nptMPI ,mpierr)
   myps = int(mypsMPI,kind=c_int)
   nps = int(npsMPI,kind=c_int)
   mypt = int(myptMPI,kind=c_int)
   npt = int(nptMPI,kind=c_int)

117
   call obj%timer%stop("mpi_communication")
118 119 120 121 122 123 124 125 126 127

   ! Look to elpa_transpose_vectors for the basic idea!

   ! The communictation pattern repeats in the global matrix after
   ! the least common multiple of (nps,npt) blocks

   lcm_s_t   = least_common_multiple(nps,npt) ! least common multiple of nps, npt

   nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr

128 129 130 131 132
   allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_reduce_add: aux1", istat, errorMessage)

   allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_reduce_add: aux2", istat, errorMessage)
133 134
   aux1(:) = 0
   aux2(:) = 0
Andreas Marek's avatar
Andreas Marek committed
135
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
136
   !call omp_set_num_threads(nrThreads)
Andreas Marek's avatar
Andreas Marek committed
137

Andreas Marek's avatar
Andreas Marek committed
138
   !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
Andreas Marek's avatar
Andreas Marek committed
139
#endif
140 141 142 143 144
   do n = 0, lcm_s_t-1

      ips = mod(n,nps)
      ipt = mod(n,npt)

Andreas Marek's avatar
Andreas Marek committed
145 146
      auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t)

147 148
      if(myps == ips) then

Andreas Marek's avatar
Andreas Marek committed
149 150 151 152
!         k = 0
#ifdef WITH_OPENMP
         !$omp do
#endif
153 154
         do lc=1,nvc
            do i = n, nblks_tot-1, lcm_s_t
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
155
               k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
156 157 158
               ns = (i/nps)*nblk ! local start of block i
               nl = min(nvr-i*nblk,nblk) ! length
               aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
Andreas Marek's avatar
Andreas Marek committed
159
!               k = k+nblk
160 161 162
            enddo
         enddo

Andreas Marek's avatar
Andreas Marek committed
163 164 165 166 167
         k = nvc * auxstride
#ifdef WITH_OPENMP
         !$omp barrier
         !$omp master
#endif
168

169
#ifdef WITH_MPI
170
         call obj%timer%start("mpi_communication")
171

Andreas Marek's avatar
Cleanup  
Andreas Marek committed
172 173 174
         if (k>0) call mpi_reduce(aux1, aux2, k, &
#if REALCASE == 1
                                  MPI_REAL_PRECISION,  &
175
#endif
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
176 177
#if COMPLEXCASE == 1
                                  MPI_COMPLEX_PRECISION, &
178
#endif
179
                                  MPI_SUM, int(ipt,kind=MPI_KIND), int(comm_t,kind=MPI_KIND), mpierr)
Andreas Marek's avatar
Andreas Marek committed
180

181
          call obj%timer%stop("mpi_communication")
182

183 184 185
#else /* WITH_MPI */
         if(k>0) aux2 = aux1
#endif /* WITH_MPI */
186

Andreas Marek's avatar
Andreas Marek committed
187 188 189 190 191 192 193 194 195
#ifdef WITH_OPENMP
         !$omp end master
         !$omp barrier
#endif
         if (mypt == ipt) then
!            k = 0
#ifdef WITH_OPENMP
         !$omp do
#endif
196 197
            do lc=1,nvc
               do i = n, nblks_tot-1, lcm_s_t
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
198
                  k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
199 200 201
                  ns = (i/npt)*nblk ! local start of block i
                  nl = min(nvr-i*nblk,nblk) ! length
                  vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
Andreas Marek's avatar
Andreas Marek committed
202
!                  k = k+nblk
203 204 205 206 207 208 209
               enddo
            enddo
         endif

      endif

   enddo
Andreas Marek's avatar
Andreas Marek committed
210 211 212
#ifdef WITH_OPENMP
   !$omp end parallel
#endif
213

214 215
   deallocate(aux1, aux2, stat=istat, errmsg=errorMessage)
   check_deallocate("elpa_reduce_add: aux1, aux2", istat, errorMessage)
216

217
   call obj%timer%stop("elpa_reduce_add_vectors_&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
218 219 220 221
   &MATH_DATATYPE&
   &" // &
   &PRECISION_SUFFIX &
   )
222 223 224
end subroutine