elpa_reduce_add_vectors.X90 6.36 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
21
!    http://elpa.mpcdf.mpg.de/
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
#if REALCASE==1
subroutine elpa_reduce_add_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk)
#endif
#if COMPLEXCASE==1
subroutine elpa_reduce_add_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk)
#endif

!-------------------------------------------------------------------------------
! This routine does a reduce of all vectors in vmat_s over the communicator comm_t.
! The result of the reduce is gathered on the processors owning the diagonal
! and added to the array of vectors vmat_t (which is distributed over comm_t).
!
! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s
! in the different members within vmat_t (else a reduce wouldn't be necessary).
! After this routine, an allreduce of vmat_t has to be done.
!
! vmat_s    array of vectors to be reduced and added
! ld_s      leading dimension of vmat_s
! comm_s    communicator over which vmat_s is distributed
! vmat_t    array of vectors to which vmat_s is added
! ld_t      leading dimension of vmat_t
! comm_t    communicator over which vmat_t is distributed
! nvr       global length of vmat_s/vmat_t
! nvc       number of columns in vmat_s/vmat_t
! nblk      block size of block cyclic distribution
!
!-------------------------------------------------------------------------------

!   use ELPA1 ! for least_common_multiple
Andreas Marek's avatar
Andreas Marek committed
81 82 83
#ifdef WITH_OPENMP
   use omp_lib
#endif
84 85 86 87
   implicit none

   include 'mpif.h'

Andreas Marek's avatar
Andreas Marek committed
88
   integer, intent(in)              :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk
89 90 91
   DATATYPE*BYTESIZE, intent(in)    :: vmat_s(ld_s,nvc)
   DATATYPE*BYTESIZE, intent(inout) :: vmat_t(ld_t,nvc)

Andreas Marek's avatar
Andreas Marek committed
92 93 94 95 96
   DATATYPE*BYTESIZE, allocatable   :: aux1(:), aux2(:)
   integer                          :: myps, mypt, nps, npt
   integer                          :: n, lc, k, i, ips, ipt, ns, nl, mpierr
   integer                          :: lcm_s_t, nblks_tot
   integer                          :: auxstride, tylerk, error_unit
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115

   call mpi_comm_rank(comm_s,myps,mpierr)
   call mpi_comm_size(comm_s,nps ,mpierr)
   call mpi_comm_rank(comm_t,mypt,mpierr)
   call mpi_comm_size(comm_t,npt ,mpierr)

   ! Look to elpa_transpose_vectors for the basic idea!

   ! The communictation pattern repeats in the global matrix after
   ! the least common multiple of (nps,npt) blocks

   lcm_s_t   = least_common_multiple(nps,npt) ! least common multiple of nps, npt

   nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr

   allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   aux1(:) = 0
   aux2(:) = 0
Andreas Marek's avatar
Andreas Marek committed
116 117 118
#ifdef WITH_OPENMP
   !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl)
#endif
119 120 121 122 123
   do n = 0, lcm_s_t-1

      ips = mod(n,nps)
      ipt = mod(n,npt)

Andreas Marek's avatar
Andreas Marek committed
124 125
      auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t)

126 127
      if(myps == ips) then

Andreas Marek's avatar
Andreas Marek committed
128 129 130 131
!         k = 0
#ifdef WITH_OPENMP
         !$omp do
#endif
132 133
         do lc=1,nvc
            do i = n, nblks_tot-1, lcm_s_t
Andreas Marek's avatar
Andreas Marek committed
134
	       k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
135 136 137
               ns = (i/nps)*nblk ! local start of block i
               nl = min(nvr-i*nblk,nblk) ! length
               aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
Andreas Marek's avatar
Andreas Marek committed
138
!               k = k+nblk
139 140 141
            enddo
         enddo

Andreas Marek's avatar
Andreas Marek committed
142 143 144 145 146
         k = nvc * auxstride
#ifdef WITH_OPENMP
         !$omp barrier
         !$omp master
#endif
147 148 149 150 151 152 153
#if REALCASE==1
         if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr)
#endif

#if COMPLEXCASE==1
         if(k>0) call mpi_reduce(aux1,aux2,k,MPI_DOUBLE_COMPLEX,MPI_SUM,ipt,comm_t,mpierr)
#endif
Andreas Marek's avatar
Andreas Marek committed
154 155 156 157 158 159 160 161 162 163

#ifdef WITH_OPENMP
         !$omp end master
         !$omp barrier
#endif
         if (mypt == ipt) then
!            k = 0
#ifdef WITH_OPENMP
         !$omp do
#endif
164 165
            do lc=1,nvc
               do i = n, nblks_tot-1, lcm_s_t
Andreas Marek's avatar
Andreas Marek committed
166
	          k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
167 168 169
                  ns = (i/npt)*nblk ! local start of block i
                  nl = min(nvr-i*nblk,nblk) ! length
                  vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
Andreas Marek's avatar
Andreas Marek committed
170
!                  k = k+nblk
171 172 173 174 175 176 177
               enddo
            enddo
         endif

      endif

   enddo
Andreas Marek's avatar
Andreas Marek committed
178 179 180
#ifdef WITH_OPENMP
   !$omp end parallel
#endif
181 182 183 184 185 186 187

   deallocate(aux1)
   deallocate(aux2)

end subroutine