redist_band.F90 8.74 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15
16
17
18
19
20
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
21
!    http://elpa.mpcdf.mpg.de/
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
43
! Author: Andreas Marek, MPCDF
44
#endif
45
46
! --------------------------------------------------------------------------------------------------
! redist_band: redistributes band from 2D block cyclic form to 1D band
47
48
49

#include "config-f90.h"

Andreas Marek's avatar
Cleanup    
Andreas Marek committed
50
51
52
53
subroutine redist_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
Pavel Kus's avatar
Pavel Kus committed
54
           (obj, a_mat, a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
55

56
   use elpa_abstract_impl
57
   use elpa2_workload
Andreas Marek's avatar
Andreas Marek committed
58
   use precision
Andreas Marek's avatar
Andreas Marek committed
59
   use iso_c_binding
60
   use cuda_functions
61
62
   use elpa_utilities, only : local_index
   use elpa_mpi
63
64
   implicit none

Andreas Marek's avatar
Andreas Marek committed
65
66
67
   class(elpa_abstract_impl_t), intent(inout)       :: obj
   logical, intent(in)                              :: useGPU
   integer(kind=ik), intent(in)                     :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
Pavel Kus's avatar
Pavel Kus committed
68
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in)  :: a_mat(lda, matrixCols)
69
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(out) :: ab(:,:)
70

Andreas Marek's avatar
Andreas Marek committed
71
72
   integer(kind=ik), allocatable                    :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), &
                                                       global_id(:,:), global_id_tmp(:,:), block_limits(:)
73
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: sbuf(:,:,:), rbuf(:,:,:), buf(:,:)
74

Andreas Marek's avatar
Andreas Marek committed
75
76
77
   integer(kind=ik)                                 :: i, j, my_pe, n_pes, my_prow, np_rows, my_pcol, np_cols, &
                                                       nfact, np, npr, npc, mpierr, is, js
   integer(kind=ik)                                 :: nblocks_total, il, jl, l_rows, l_cols, n_off
78

Andreas Marek's avatar
Andreas Marek committed
79
80
   logical                                          :: successCUDA
   integer(kind=c_intptr_t)                         :: a_dev
81
   integer(kind=c_intptr_t), parameter              :: size_of_datatype = size_of_&
82
83
84
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
85

86
   call obj%timer%start("redist_band_&
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
87
88
89
90
   &MATH_DATATYPE&
   &" // &
   &PRECISION_SUFFIX &
   )
91

92

93
   call obj%timer%start("mpi_communication")
94
95
   call mpi_comm_rank(communicator,my_pe,mpierr)
   call mpi_comm_size(communicator,n_pes,mpierr)
96
97
98
99
100

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
101
   call obj%timer%stop("mpi_communication")
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
102

103
104
105
106
107
108
109
110
   ! Get global_id mapping 2D procssor coordinates to global id

   allocate(global_id(0:np_rows-1,0:np_cols-1))
#ifdef WITH_OPENMP
   allocate(global_id_tmp(0:np_rows-1,0:np_cols-1))
#endif
   global_id(:,:) = 0
   global_id(my_prow, my_pcol) = my_pe
111
#ifdef WITH_MPI
112
   call obj%timer%start("mpi_communication")
113
114
#ifdef WITH_OPENMP
   global_id_tmp(:,:) = global_id(:,:)
115
   call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
116
117
   deallocate(global_id_tmp)
#else
118
   call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
119
#endif
120
   call obj%timer%stop("mpi_communication")
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
121
#endif /* WITH_MPI */
122
123
124
125
126
   ! Set work distribution

   nblocks_total = (na-1)/nbw + 1

   allocate(block_limits(0:n_pes))
127
   call divide_band(obj, nblocks_total, n_pes, block_limits)
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154


   allocate(ncnt_s(0:n_pes-1))
   allocate(nstart_s(0:n_pes-1))
   allocate(ncnt_r(0:n_pes-1))
   allocate(nstart_r(0:n_pes-1))


   nfact = nbw/nblk

   ! Count how many blocks go to which PE

   ncnt_s(:) = 0
   np = 0 ! receiver PE number
   do j=0,(na-1)/nblk ! loop over rows of blocks
     if (j/nfact==block_limits(np+1)) np = np+1
     if (mod(j,np_rows) == my_prow) then
       do i=0,nfact
         if (mod(i+j,np_cols) == my_pcol) then
           ncnt_s(np) = ncnt_s(np) + 1
         endif
       enddo
     endif
   enddo

   ! Allocate send buffer

155
156
   allocate(sbuf(nblk,nblk,sum(ncnt_s)))
   sbuf(:,:,:) = 0.
157
158
159
160
161
162
163
164
165
166

   ! Determine start offsets in send buffer

   nstart_s(0) = 0
   do i=1,n_pes-1
     nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1)
   enddo

   ! Fill send buffer

Pavel Kus's avatar
Pavel Kus committed
167
168
   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a_mat
169
170
171
172
173
174
175
176
177
178
179
180
181

   np = 0
   do j=0,(na-1)/nblk ! loop over rows of blocks
     if (j/nfact==block_limits(np+1)) np = np+1
     if (mod(j,np_rows) == my_prow) then
       do i=0,nfact
         if (mod(i+j,np_cols) == my_pcol) then
           nstart_s(np) = nstart_s(np) + 1
           js = (j/np_rows)*nblk
           is = ((i+j)/np_cols)*nblk
           jl = MIN(nblk,l_rows-js)
           il = MIN(nblk,l_cols-is)

Pavel Kus's avatar
Pavel Kus committed
182
           sbuf(1:jl,1:il,nstart_s(np)) = a_mat(js+1:js+jl,is+1:is+il)
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
         endif
       enddo
      endif
   enddo

   ! Count how many blocks we get from which PE

   ncnt_r(:) = 0
   do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk)
     npr = mod(j,np_rows)
     do i=0,nfact
       npc = mod(i+j,np_cols)
       np = global_id(npr,npc)
       ncnt_r(np) = ncnt_r(np) + 1
     enddo
   enddo

   ! Allocate receive buffer

202
   allocate(rbuf(nblk,nblk,sum(ncnt_r)))
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221

   ! Set send counts/send offsets, receive counts/receive offsets
   ! now actually in variables, not in blocks

   ncnt_s(:) = ncnt_s(:)*nblk*nblk

   nstart_s(0) = 0
   do i=1,n_pes-1
     nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1)
   enddo

   ncnt_r(:) = ncnt_r(:)*nblk*nblk

   nstart_r(0) = 0
   do i=1,n_pes-1
     nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1)
   enddo

   ! Exchange all data with MPI_Alltoallv
222
#ifdef WITH_MPI
223
   call obj%timer%start("mpi_communication")
224

225
226
    call MPI_Alltoallv(sbuf, ncnt_s, nstart_s, MPI_MATH_DATATYPE_PRECISION_EXPL, &
                       rbuf, ncnt_r, nstart_r, MPI_MATH_DATATYPE_PRECISION_EXPL, communicator, mpierr)
227

228
    call obj%timer%stop("mpi_communication")
229
#else /* WITH_MPI */
230
    rbuf = sbuf
231
#endif /* WITH_MPI */
232
233

! set band from receive buffer
234
235
236
237
238
239
240
241

   ncnt_r(:) = ncnt_r(:)/(nblk*nblk)

   nstart_r(0) = 0
   do i=1,n_pes-1
     nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1)
   enddo

242
   allocate(buf((nfact+1)*nblk,nblk))
243
244
245
246
247
248
249
250
251
252
253

   ! n_off: Offset of ab within band
   n_off = block_limits(my_pe)*nbw

   do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk)
     npr = mod(j,np_rows)
     do i=0,nfact
       npc = mod(i+j,np_cols)
       np = global_id(npr,npc)
       nstart_r(np) = nstart_r(np) + 1
#if REALCASE==1
254
       buf(i*nblk+1:i*nblk+nblk,:) = transpose(rbuf(:,:,nstart_r(np)))
255
256
#endif
#if COMPLEXCASE==1
257
       buf(i*nblk+1:i*nblk+nblk,:) = conjg(transpose(rbuf(:,:,nstart_r(np))))
258
259
260
#endif
     enddo
     do i=1,MIN(nblk,na-j*nblk)
261
       ab(1:nbw+1,i+j*nblk-n_off) = buf(i:i+nbw,i)
262
263
264
265
266
267
268
269
     enddo
   enddo

   deallocate(ncnt_s, nstart_s)
   deallocate(ncnt_r, nstart_r)
   deallocate(global_id)
   deallocate(block_limits)

270
   deallocate(sbuf, rbuf, buf)
271

272
   call obj%timer%stop("redist_band_&
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
273
274
275
276
   &MATH_DATATYPE&
   &" // &
   &PRECISION_SUFFIX &
   )
277
278
279

end subroutine