elpa_cholesky_template.F90 11.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.

45
#include "../general/sanity.F90"
46
47
48
49
     use elpa1_compute
     use elpa_utilities
     use elpa_mpi
     use precision
50
     use elpa_abstract_impl
51
52
     use elpa_omp

53
     implicit none
Pavel Kus's avatar
Pavel Kus committed
54
#include "../general/precision_kinds.F90"
55
      class(elpa_abstract_impl_t), intent(inout) :: obj
56
      integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
57
#ifdef USE_ASSUMED_SIZE
Pavel Kus's avatar
Pavel Kus committed
58
      MATH_DATATYPE(kind=rck)      :: a(obj%local_nrows,*)
59
#else
Pavel Kus's avatar
Pavel Kus committed
60
      MATH_DATATYPE(kind=rck)      :: a(obj%local_nrows,obj%local_ncols)
61
62
63
64
65
66
67
#endif
      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
      integer(kind=ik)              :: n, nc, i, info
      integer(kind=ik)              :: lcs, lce, lrs, lre
      integer(kind=ik)              :: tile_size, l_rows_tile, l_cols_tile

Pavel Kus's avatar
Pavel Kus committed
68
      MATH_DATATYPE(kind=rck), allocatable    :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
69
      logical                       :: wantDebug
70
      logical                       :: success
Andreas Marek's avatar
Andreas Marek committed
71
      integer(kind=ik)              :: istat, debug, error
72
      character(200)                :: errorMessage
73
      integer(kind=ik)              :: nrThreads
74

75
      call obj%timer%start("elpa_cholesky_&
76
77
      &MATH_DATATYPE&
      &_&
78
      &PRECISION&
79
      &")
80

Andreas Marek's avatar
Andreas Marek committed
81
#ifdef WITH_OPENMP
82
83
84
85
86
      ! store the number of OpenMP threads used in the calling function
      ! restore this at the end of ELPA 2
      omp_threads_caller = omp_get_max_threads()

      ! check the number of threads that ELPA should use internally
Andreas Marek's avatar
Andreas Marek committed
87
88
      call obj%get("omp_threads",nrThreads,error)
      call omp_set_num_threads(nrThreads)
Andreas Marek's avatar
Andreas Marek committed
89
#else
90
      nrThreads=1
Andreas Marek's avatar
Andreas Marek committed
91
92
#endif

93
94
95
96
97
      na         = obj%na
      lda        = obj%local_nrows
      nblk       = obj%nblk
      matrixCols = obj%local_ncols

Andreas Marek's avatar
Andreas Marek committed
98
99
100
101
102
103
104
105
106
107
      call obj%get("mpi_comm_rows",mpi_comm_rows,error )
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      call obj%get("mpi_comm_cols",mpi_comm_cols,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
108

Andreas Marek's avatar
Andreas Marek committed
109
110
111
112
113
      call obj%get("debug",debug,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
114
      if (debug == 1) then
115
116
117
118
119
        wantDebug = .true.
      else
        wantDebug = .false.
      endif

120
      call obj%timer%start("mpi_communication")
121
122
123
124
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
125
      call obj%timer%stop("mpi_communication")
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
      success = .true.

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
142
  &MATH_DATATYPE&: error when allocating tmp1 "//errorMessage
143
144
145
146
147
148
        stop 1
      endif

      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
149
150
  &MATH_DATATYPE&
  &: error when allocating tmp2 "//errorMessage
151
152
153
154
155
156
157
158
159
        stop 1
      endif

      tmp1 = 0
      tmp2 = 0

      allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
160
161
  &MATH_DATATYPE&
  &: error when allocating tmatr "//errorMessage
162
163
164
165
166
167
        stop 1
      endif

      allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
168
169
  &MATH_DATATYPE&
  &: error when allocating tmatc "//errorMessage
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
        stop 1
      endif

      tmatr = 0
      tmatc = 0

      do n = 1, na, nblk
        ! Calculate first local row and column of the still remaining matrix
        ! on the local processor

        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

        l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
        l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)

        if (n+nblk > na) then

          ! This is the last step, just do a Cholesky-Factorization
          ! of the remaining block

          if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
192
            call obj%timer%start("blas")
193
194

            call PRECISION_POTRF('U', na-n+1, a(l_row1,l_col1), lda, info)
195
            call obj%timer%stop("blas")
196
197
198

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
199
        &MATH_DATATYPE&
200
201

#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
202
        &: Error in dpotrf: ",info
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#endif
#if COMPLEXCASE == 1
              &: Error in zpotrf: ",info
#endif
              success = .false.
              return
            endif

          endif

          exit ! Loop

        endif

        if (my_prow==prow(n, nblk, np_rows)) then

          if (my_pcol==pcol(n, nblk, np_cols)) then

            ! The process owning the upper left remaining block does the
            ! Cholesky-Factorization of this block
223
            call obj%timer%start("blas")
224
225

            call PRECISION_POTRF('U', nblk, a(l_row1,l_col1), lda, info)
226
            call obj%timer%stop("blas")
227
228
229

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
230
        &MATH_DATATYPE&
231
232

#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
233
        &: Error in dpotrf 2: ",info
234
235
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
236
        &: Error in zpotrf 2: ",info
237
238
239
240
241
242
243
244
245
246
247
248
249

#endif
              success = .false.
              return
            endif

            nc = 0
            do i=1,nblk
              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
              nc = nc+i
            enddo
          endif
#ifdef WITH_MPI
250
          call obj%timer%start("mpi_communication")
251
252
253

          call MPI_Bcast(tmp1, nblk*(nblk+1)/2,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
254
                   MPI_REAL_PRECISION,         &
255
256
257
258
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_PRECISION,      &
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
259
       pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
260

261
          call obj%timer%stop("mpi_communication")
262
263
264
265
266
267
268
269

#endif /* WITH_MPI */
          nc = 0
          do i=1,nblk
            tmp2(1:i,i) = tmp1(nc+1:nc+i)
            nc = nc+i
          enddo

270
          call obj%timer%start("blas")
271
          if (l_cols-l_colx+1>0) &
Pavel Kus's avatar
Pavel Kus committed
272
273
              call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', nblk, l_cols-l_colx+1, ONE, tmp2, &
                            ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
274
          call obj%timer%stop("blas")
275
276
277
278
279
280
281
282
283
284
285
286
287
        endif

        do i=1,nblk

#if REALCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
#endif
#if COMPLEXCASE == 1
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols))
#endif

#ifdef WITH_MPI

288
          call obj%timer%start("mpi_communication")
289
          if (l_cols-l_colx+1>0) &
Pavel Kus's avatar
Pavel Kus committed
290
291
            call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_MATH_DATATYPE_PRECISION, &
                           prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
292

293
          call obj%timer%stop("mpi_communication")
294
295
296
297
#endif /* WITH_MPI */
        enddo
        ! this has to be checked since it was changed substantially when doing type safe
        call elpa_transpose_vectors_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
298
299
300
301
  &MATH_DATATYPE&
  &_&
  &PRECISION &
                 (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
302
                                      tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
303
                                      n, na, nblk, nblk, nrThreads)
304
305
306
307
308
309
310

        do i=0,(na-1)/tile_size
          lcs = max(l_colx,i*l_cols_tile+1)
          lce = min(l_cols,(i+1)*l_cols_tile)
          lrs = l_rowx
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<lrs) cycle
311
          call obj%timer%start("blas")
Pavel Kus's avatar
Pavel Kus committed
312
          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre-lrs+1, lce-lcs+1, nblk, -ONE,  &
313
                              tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
Pavel Kus's avatar
Pavel Kus committed
314
                              ONE, a(lrs,lcs), lda)
315
          call obj%timer%stop("blas")
316
317
318
319
320
321
322
323

        enddo

      enddo

      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
324
325
  &MATH_DATATYPE&
  &: error when deallocating tmp1 "//errorMessage
326
327
328
329
330
331
332
333
334
335
336
337
338
        stop 1
      endif

      ! Set the lower triangle to 0, it contains garbage (form the above matrix multiplications)

      do i=1,na
        if (my_pcol==pcol(i, nblk, np_cols)) then
          ! column i is on local processor
          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
          a(l_row1:l_rows,l_col1) = 0
        endif
      enddo
339
340
341
342
343
344
345
346

      ! restore original OpenMP settings
#ifdef WITH_OPENMP
      ! store the number of OpenMP threads used in the calling function
      ! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
#endif
      
347
      call obj%timer%stop("elpa_cholesky_&
348
349
350
      &MATH_DATATYPE&
      &_&
      &PRECISION&
351
      &")