elpa_transpose_vectors.X90 7.28 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
21
!    http://elpa.mpcdf.mpg.de/
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52 53
#include "config-f90.h"

54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
#if REALCASE==1
subroutine elpa_transpose_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk)
#endif
#if COMPLEXCASE==1
subroutine elpa_transpose_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk)
#endif

!-------------------------------------------------------------------------------
! This routine transposes an array of vectors which are distributed in
! communicator comm_s into its transposed form distributed in communicator comm_t.
! There must be an identical copy of vmat_s in every communicator comm_s.
! After this routine, there is an identical copy of vmat_t in every communicator comm_t.
!
! vmat_s    original array of vectors
! ld_s      leading dimension of vmat_s
! comm_s    communicator over which vmat_s is distributed
! vmat_t    array of vectors in transposed form
! ld_t      leading dimension of vmat_t
! comm_t    communicator over which vmat_t is distributed
! nvs       global index where to start in vmat_s/vmat_t
!           Please note: this is kind of a hint, some values before nvs will be
!           accessed in vmat_s/put into vmat_t
! nvr       global length of vmat_s/vmat_t
! nvc       number of columns in vmat_s/vmat_t
! nblk      block size of block cyclic distribution
!
!-------------------------------------------------------------------------------
Andreas Marek's avatar
Andreas Marek committed
81
    use precision
82 83

!   use ELPA1 ! for least_common_multiple
Andreas Marek's avatar
Andreas Marek committed
84 85 86
#ifdef WITH_OPENMP
   use omp_lib
#endif
87 88 89 90 91

   implicit none

   include 'mpif.h'

Andreas Marek's avatar
Andreas Marek committed
92 93 94
   integer(kind=ik), intent(in)              :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk
   DATATYPE, intent(in)                      :: vmat_s(ld_s,nvc)
   DATATYPE, intent(inout)                   :: vmat_t(ld_t,nvc)
95

Andreas Marek's avatar
Andreas Marek committed
96 97 98 99 100
   DATATYPE, allocatable                     :: aux(:)
   integer(kind=ik)                          :: myps, mypt, nps, npt
   integer(kind=ik)                          :: n, lc, k, i, ips, ipt, ns, nl, mpierr
   integer(kind=ik)                          :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip
   integer(kind=ik)                          :: auxstride
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124

   call mpi_comm_rank(comm_s,myps,mpierr)
   call mpi_comm_size(comm_s,nps ,mpierr)
   call mpi_comm_rank(comm_t,mypt,mpierr)
   call mpi_comm_size(comm_t,npt ,mpierr)

   ! The basic idea of this routine is that for every block (in the block cyclic
   ! distribution), the processor within comm_t which owns the diagonal
   ! broadcasts its values of vmat_s to all processors within comm_t.
   ! Of course this has not to be done for every block separately, since
   ! the communictation pattern repeats in the global matrix after
   ! the least common multiple of (nps,npt) blocks

   lcm_s_t   = least_common_multiple(nps,npt) ! least common multiple of nps, npt

   nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr

   ! Get the number of blocks to be skipped at the begin.
   ! This must be a multiple of lcm_s_t (else it is getting complicated),
   ! thus some elements before nvs will be accessed/set.

   nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t

   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
Andreas Marek's avatar
Andreas Marek committed
125
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
126 127
   !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
#endif
128 129 130 131 132 133 134 135
   do n = 0, lcm_s_t-1

      ips = mod(n,nps)
      ipt = mod(n,npt)

      if(mypt == ipt) then

         nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t
Andreas Marek's avatar
Andreas Marek committed
136 137 138
         auxstride = nblk * nblks_comm
!         if(nblks_comm==0) cycle
         if (nblks_comm .ne. 0) then
139
         if(myps == ips) then
Andreas Marek's avatar
Andreas Marek committed
140 141 142 143
!            k = 0
#ifdef WITH_OPENMP
            !$omp do
#endif
144 145
            do lc=1,nvc
               do i = nblks_skip+n, nblks_tot-1, lcm_s_t
Andreas Marek's avatar
Andreas Marek committed
146
                  k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
147 148 149
                  ns = (i/nps)*nblk ! local start of block i
                  nl = min(nvr-i*nblk,nblk) ! length
                  aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
Andreas Marek's avatar
Andreas Marek committed
150
!                  k = k+nblk
151 152 153 154
               enddo
            enddo
         endif

Andreas Marek's avatar
Andreas Marek committed
155 156 157 158
#ifdef WITH_OPENMP
         !$omp barrier
         !$omp master
#endif
159
#if COMPLEXCASE==1
160 161 162 163 164

#ifdef DOUBLE_PRECISION_COMPLEX
         call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_DOUBLE_COMPLEX, ips, comm_s, mpierr)
#else
         call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_COMPLEX, ips, comm_s, mpierr)
165 166
#endif

167 168
#endif /* DOUBLE_PRECISION_COMPLEX */

169
#if REALCASE==1
170 171 172 173 174

#ifdef DOUBLE_PRECISION_REAL
         call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_REAL8, ips, comm_s, mpierr)
#else
         call MPI_Bcast(aux, nblks_comm*nblk*nvc, MPI_REAL4, ips, comm_s, mpierr)
175
#endif
176 177 178

#endif /* REALCASE == 1 */

Andreas Marek's avatar
Andreas Marek committed
179 180 181 182 183 184 185
#ifdef WITH_OPENMP
         !$omp end master
         !$omp barrier

         !$omp do
#endif
!         k = 0
186 187
         do lc=1,nvc
            do i = nblks_skip+n, nblks_tot-1, lcm_s_t
Andreas Marek's avatar
Andreas Marek committed
188
               k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
189 190 191
               ns = (i/npt)*nblk ! local start of block i
               nl = min(nvr-i*nblk,nblk) ! length
               vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl)
Andreas Marek's avatar
Andreas Marek committed
192
!               k = k+nblk
193 194
            enddo
         enddo
Andreas Marek's avatar
Andreas Marek committed
195
         endif
196 197 198
      endif

   enddo
Andreas Marek's avatar
Andreas Marek committed
199 200 201
#ifdef WITH_OPENMP
   !$omp end parallel
#endif
202 203 204 205
   deallocate(aux)

end subroutine