elpa1_auxiliary.F90 138 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

#include "config-f90.h"

Andreas Marek's avatar
Andreas Marek committed
55
56
!> \brief Fortran module which provides helper routines for matrix calculations
module ELPA1_AUXILIARY
57
58
  use elpa_utilities
  
59
60
  implicit none

61
62
  public :: elpa_mult_at_b_real_double      !< Multiply double-precision real matrices A**T * B
  public :: mult_at_b_real                  !< Old, deprecated interface to multiply double-precision real matrices A**T * B. DO NOT USE
63

64
65
  public :: elpa_mult_ah_b_complex_double   !< Multiply double-precision complex matrices A**H * B
  public :: mult_ah_b_complex               !< Old, deprecated interface to multiply double-precision complex matrices A**H * B. DO NOT USE
66

67
68
  public :: elpa_invert_trm_real_double     !< Invert double-precision real triangular matrix
  public :: invert_trm_real                 !< Old, deprecated interface for inversion of double-precision real triangular matrix. DO NOT USE
69

70
71
  public :: elpa_invert_trm_complex_double  !< Invert double-precision complex triangular matrix
  public :: invert_trm_complex              !< Old, deprecated interface to invert double-precision complex triangular matrix. DO NOT USE
72

73
74
  public :: elpa_cholesky_real_double       !< Cholesky factorization of a double-precision real matrix
  public :: cholesky_real                   !< Old, deprecated name for Cholesky factorization of a double-precision real matrix. DO NOT USE
75

76
77
  public :: elpa_cholesky_complex_double    !< Cholesky factorization of a double-precision complex matrix
  public :: cholesky_complex                !< Old, deprecated interface for a Cholesky factorization of a double-precision complex matrix. DO NOT USE
78

79
80
  public :: elpa_solve_tridi_double         !< Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method
  public :: solve_tridi                     !< Old, deprecated interface to solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method
81
82

#ifdef WANT_SINGLE_PRECISION_REAL
83
84
85
86
  public :: elpa_cholesky_real_single       !< Cholesky factorization of a single-precision real matrix
  public :: elpa_invert_trm_real_single     !< Invert single-precision real triangular matrix
  public :: elpa_mult_at_b_real_single      !< Multiply single-precision real matrices A**T * B
  public :: elpa_solve_tridi_single         !< Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method
87
88
89
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
90
91
92
  public :: elpa_cholesky_complex_single    !< Cholesky factorization of a single-precision complex matrix
  public :: elpa_invert_trm_complex_single  !< Invert single-precision complex triangular matrix
  public :: elpa_mult_ah_b_complex_single   !< Multiply single-precision complex matrices A**H * B
93
94
#endif

95
!> \brief  cholesky_real: old, deprecated interface for Cholesky factorization of a double-precision real symmetric matrix
96
97
98
99
100
!> \details
!>
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
101
!>                              Only upper triangle needs to be set.
102
103
104
105
106
107
108
109
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param                       matrixCols  local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
110
!> \result success               logical, reports success or failure
111
  !> Deprecated generic name; resolves to the double-precision implementation.
  interface cholesky_real
    module procedure elpa_cholesky_real_double
  end interface

115
!> \brief  Old, deprecated interface invert_trm_real: Inverts a upper double-precision triangular matrix
116
117
118
119
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
120
!>                              Only upper triangle needs to be set.
121
122
123
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
Andreas Marek's avatar
Andreas Marek committed
124
!> \param  matrixCols           local columns of matrix a
125
126
127
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
128
!> \param result                logical, reports success or failure
129
130

  !> Deprecated generic name; resolves to the double-precision implementation.
  interface invert_trm_real
    module procedure elpa_invert_trm_real_double
  end interface

134
!> \brief  old, deprecated interface cholesky_complex: Cholesky factorization of a double-precision complex hermitian matrix
135
136
137
138
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
139
!>                              Only upper triangle needs to be set.
140
141
142
143
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
Andreas Marek's avatar
Andreas Marek committed
144
!> \param  matrixCols           local columns of matrix a
145
146
147
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
148
!> \result success              logical, reports success or failure
149
150
151


  !> Deprecated generic name; resolves to the double-precision implementation.
  !> Fix: previously mapped to elpa_cholesky_real_double, so the deprecated
  !> complex interface silently dispatched to the REAL routine.
  interface cholesky_complex
    module procedure elpa_cholesky_complex_double
  end interface

155
!> \brief  old, deprecated interface invert_trm_complex: Inverts a double-precision complex upper triangular matrix
156
157
158
159
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
160
!>                              Only upper triangle needs to be set.
161
162
163
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
Andreas Marek's avatar
Andreas Marek committed
164
!> \param  matrixCols           local columns of matrix a
165
166
167
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
168
!> \result success              logical, reports success or failure
169
170

  !> Deprecated generic name; resolves to the double-precision implementation.
  interface invert_trm_complex
    module procedure elpa_invert_trm_complex_double
  end interface

174
!> \brief  mult_at_b_real: Performs C : = A**T * B for double matrices
175
!> this is the old, deprecated interface for the newer elpa_mult_at_b_real
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
!>         where   A is a square matrix (na,na) which is optionally upper or lower triangular
!>                 B is a (na,ncb) matrix
!>                 C is a (na,ncb) matrix where optionally only the upper or lower
!>                   triangle may be computed
!> \details

!> \param  uplo_a               'U' if A is upper triangular
!>                              'L' if A is lower triangular
!>                              anything else if A is a full matrix
!>                              Please note: This pertains to the original A (as set in the calling program)
!>                                           whereas the transpose of A is used for calculations
!>                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                              i.e. it may contain arbitrary numbers
!> \param uplo_c                'U' if only the upper diagonal part of C is needed
!>                              'L' if only the lower diagonal part of C is needed
!>                              anything else if the full matrix C is needed
!>                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                            written to a certain extent, i.e. one shouldn't rely on the content there!
!> \param na                    Number of rows/columns of A, number of rows of B and C
!> \param ncb                   Number of columns  of B and C
!> \param a                     matrix a
!> \param lda                   leading dimension of matrix a
!> \param b                     matrix b
!> \param ldb                   leading dimension of matrix b
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param c                     matrix c
!> \param ldc                   leading dimension of matrix c
  !> Deprecated generic name; resolves to the double-precision implementation.
  interface mult_at_b_real
    module procedure elpa_mult_at_b_real_double
  end interface

209
!> \brief  Old, deprecated interface mult_ah_b_complex: Performs C : = A**H * B for double-precision matrices
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
!>         where   A is a square matrix (na,na) which is optionally upper or lower triangular
!>                 B is a (na,ncb) matrix
!>                 C is a (na,ncb) matrix where optionally only the upper or lower
!>                   triangle may be computed
!> \details
!>
!> \param  uplo_a               'U' if A is upper triangular
!>                              'L' if A is lower triangular
!>                              anything else if A is a full matrix
!>                              Please note: This pertains to the original A (as set in the calling program)
!>                                           whereas the transpose of A is used for calculations
!>                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                              i.e. it may contain arbitrary numbers
!> \param uplo_c                'U' if only the upper diagonal part of C is needed
!>                              'L' if only the lower diagonal part of C is needed
!>                              anything else if the full matrix C is needed
!>                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                            written to a certain extent, i.e. one shouldn't rely on the content there!
!> \param na                    Number of rows/columns of A, number of rows of B and C
!> \param ncb                   Number of columns  of B and C
!> \param a                     matrix a
!> \param lda                   leading dimension of matrix a
!> \param b                     matrix b
!> \param ldb                   leading dimension of matrix b
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param c                     matrix c
!> \param ldc                   leading dimension of matrix c
  !> Deprecated generic name; resolves to the double-precision implementation.
  interface mult_ah_b_complex
    module procedure elpa_mult_ah_b_complex_double
  end interface


244
!> \brief  solve_tridi: Old, deprecated interface to solve a double-precision tridiagonal eigensystem for a double-precision matrix with divide and conquer method
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
!> \details
!>
!> \param na                    Matrix dimension
!> \param nev                   number of eigenvalues/vectors to be computed
!> \param d                     array d(na) on input diagonal elements of tridiagonal matrix, on
!>                              output the eigenvalues in ascending order
!> \param e                     array e(na) on input subdiagonal elements of matrix, on exit destroyed
!> \param q                     on exit : matrix q(ldq,matrixCols) contains the eigenvectors
!> \param ldq                   leading dimension of matrix q
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param matrixCols            columns of matrix q
!> \param mpi_comm_rows         MPI communicator for rows
!> \param mpi_comm_cols         MPI communicator for columns
!> \param wantDebug             logical, give more debug information if .true.
!> \result success              logical, .true. on success, else .false.
  !> Deprecated generic name; resolves to the double-precision implementation.
  interface solve_tridi
    module procedure elpa_solve_tridi_double
  end interface
263

264
265
  contains

266
!> \brief  cholesky_real_double: Cholesky factorization of a double-precision real symmetric matrix
267
!> \details
268
269
270
271
!>
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
272
!>                              Only upper triangle needs to be set.
273
274
275
276
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
277
!> \param  matrixCols           local columns of matrix a
278
279
280
281
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \param success              logical, reports success or failure
282
283
284
285
286
287

#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8

   !> Computes the Cholesky factorization of a distributed double-precision real
   !> symmetric matrix. Only the upper triangle of a is referenced on input; on
   !> return the upper triangle holds the Cholesky factor and the lower triangle
   !> is explicitly zeroed. Returns .false. on failure (e.g. matrix not positive
   !> definite), .true. on success.
   function elpa_cholesky_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                            wantDebug) result(success)
     use elpa1_compute
     use elpa_utilities
     use elpa_mpi
#ifdef HAVE_DETAILED_TIMINGS
     use timings
#else
     use timings_dummy
#endif
     use precision
      implicit none

      integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE)      :: a(lda,*)
#else
      real(kind=REAL_DATATYPE)      :: a(lda,matrixCols)
#endif
      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
      integer(kind=ik)              :: n, nc, i, info
      integer(kind=ik)              :: lcs, lce, lrs, lre
      integer(kind=ik)              :: tile_size, l_rows_tile, l_cols_tile

      real(kind=REAL_DATATYPE), allocatable    :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)

      logical, intent(in)           :: wantDebug
      logical                       :: success

      integer(kind=ik)              :: istat
      character(200)                :: errorMessage

#ifdef DOUBLE_PRECISION_REAL
      call timer%start("elpa_cholesky_real_double")
#else
      call timer%start("elpa_cholesky_real_single")
#endif

      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")

      success = .true.

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_real: error when allocating tmp1 "//errorMessage
        stop
      endif

      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_real: error when allocating tmp2 "//errorMessage
        stop
      endif

      tmp1 = 0
      tmp2 = 0

      allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_real: error when allocating tmatr "//errorMessage
        stop
      endif

      allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_real: error when allocating tmatc "//errorMessage
        stop
      endif

      tmatr = 0
      tmatc = 0

      do n = 1, na, nblk
        ! Calculate first local row and column of the still remaining matrix
        ! on the local processor

        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

        l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
        l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)

        if (n+nblk > na) then

          ! This is the last step, just do a Cholesky-Factorization
          ! of the remaining block

          if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
            call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
            call dpotrf('U', na-n+1, a(l_row1,l_col1), lda, info)
#else
            call spotrf('U', na-n+1, a(l_row1,l_col1), lda, info)
#endif
            call timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_real: Error in dpotrf 1: ",info
              success = .false.
              return
            endif

          endif

          exit ! Loop

        endif

        if (my_prow==prow(n, nblk, np_rows)) then

          if (my_pcol==pcol(n, nblk, np_cols)) then

            ! The process owning the upper left remaining block does the
            ! Cholesky-Factorization of this block
            call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
            call dpotrf('U', nblk, a(l_row1,l_col1), lda, info)
#else
            call spotrf('U', nblk, a(l_row1,l_col1), lda, info)
#endif
            call timer%stop("blas")

            if (info/=0) then
              if (wantDebug) write(error_unit,*) "elpa_cholesky_real: Error in dpotrf 2: ",info
              success = .false.
              return
            endif

            ! Pack the factorized diagonal block (upper triangle, column by
            ! column) into tmp1 for broadcasting along the process column.
            nc = 0
            do i=1,nblk
              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
              nc = nc+i
            enddo
          endif
#ifdef WITH_MPI
          call timer%start("mpi_communication")

#ifdef DOUBLE_PRECISION_REAL
          call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
          call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif

          call timer%stop("mpi_communication")
#endif /* WITH_MPI */

          ! Unpack the broadcast triangle into the square buffer tmp2
          nc = 0
          do i=1,nblk
            tmp2(1:i,i) = tmp1(nc+1:nc+i)
            nc = nc+i
          enddo

          ! Triangular solve updates the block row to the right of the pivot block
          call timer%start("blas")
          if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_REAL
              call dtrsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
              call strsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
          call timer%stop("blas")
        endif

        do i=1,nblk

          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
#ifdef WITH_MPI

          call timer%start("mpi_communication")
          if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#else
            call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#endif

          call timer%stop("mpi_communication")
#endif /* WITH_MPI */
        enddo
        ! this has to be checked since it was changed substantially when doing type safe
#ifdef DOUBLE_PRECISION_REAL
        call elpa_transpose_vectors_real_double  (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
                                      tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
                                      n, na, nblk, nblk)
#else
        call elpa_transpose_vectors_real_single  (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
                                      tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
                                      n, na, nblk, nblk)
#endif

        ! Rank-nblk update of the trailing matrix, tile by tile
        do i=0,(na-1)/tile_size
          lcs = max(l_colx,i*l_cols_tile+1)
          lce = min(l_cols,(i+1)*l_cols_tile)
          lrs = l_rowx
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<lrs) cycle
          call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
          call DGEMM('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk8,                        &
                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                     1.0_rk8, a(lrs,lcs), lda)
#else
          call SGEMM('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk4,                        &
                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                     1.0_rk4, a(lrs,lcs), lda)
#endif
          call timer%stop("blas")
        enddo

      enddo

      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"elpa_cholesky_real: error when deallocating tmp1 "//errorMessage
        stop
      endif

      ! Set the lower triangle to 0, it contains garbage (from the above matrix multiplications)

      do i=1,na
        if (my_pcol==pcol(i, nblk, np_cols)) then
          ! column i is on local processor
          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
          a(l_row1:l_rows,l_col1) = 0
        endif
      enddo
#ifdef DOUBLE_PRECISION_REAL
      call timer%stop("elpa_cholesky_real_double")
#else
      call timer%stop("elpa_cholesky_real_single")
#endif

    end function elpa_cholesky_real_double
539

540
541
542
543
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define REAL_DATATYPE rk4
544

545
!> \brief  cholesky_real_single: Cholesky factorization of a single-precision real symmetric matrix
546
!> \details
547
!>
548
!> \param  na                   Order of matrix
549
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
550
!>                              Distribution is like in Scalapack.
Andreas Marek's avatar
Andreas Marek committed
551
!>                              Only upper triangle needs to be set.
552
553
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
554
555
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
Andreas Marek's avatar
Andreas Marek committed
556
!> \param  matrixCols           local columns of matrix a
557
558
559
560
561
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \param success              logical, reports success or failure

562
563
   function elpa_cholesky_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                            wantDebug) result(success)
564
565
566
567
     use elpa1_compute
     use elpa_utilities
     use elpa_mpi
#ifdef HAVE_DETAILED_TIMINGS
568
     use timings
569
570
#else
     use timings_dummy
571
#endif
572
573
574
     use precision

     implicit none
575

576
      integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
577
578
579
580
581
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE)      :: a(lda,*)
#else
      real(kind=REAL_DATATYPE)      :: a(lda,matrixCols)
#endif
582
583
584
585
586
      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
      integer(kind=ik)              :: n, nc, i, info
      integer(kind=ik)              :: lcs, lce, lrs, lre
      integer(kind=ik)              :: tile_size, l_rows_tile, l_cols_tile
587

588
      real(kind=REAL_DATATYPE), allocatable    :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
589

590
      logical, intent(in)           :: wantDebug
591
      logical                       :: success
592
593
      integer(kind=ik)              :: istat
      character(200)                :: errorMessage
594

595
#ifdef DOUBLE_PRECISION_REAL
596
      call timer%start("elpa_cholesky_real_double")
597
#else
598
      call timer%start("elpa_cholesky_real_single")
599
#endif
600
601
602

      call timer%start("mpi_communication")

603
604
605
606
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
607
608
609

      call timer%stop("mpi_communication")

610
      success = .true.
611

612
      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
613

614
615
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
616
617
618
619
620
621
622
623
624

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
625
        print *,"elpa_cholesky_real: error when allocating tmp1 "//errorMessage
626
627
628
629
630
        stop
      endif

      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
631
        print *,"elpa_cholesky_real: error when allocating tmp2 "//errorMessage
632
633
634
635
636
637
638
639
        stop
      endif

      tmp1 = 0
      tmp2 = 0

      allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
640
        print *,"elpa_cholesky_real: error when allocating tmatr "//errorMessage
641
642
643
644
645
        stop
      endif

      allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
646
        print *,"elpa_cholesky_real: error when allocating tmatc "//errorMessage
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
        stop
      endif

      tmatr = 0
      tmatc = 0

      do n = 1, na, nblk

        ! Calculate first local row and column of the still remaining matrix
        ! on the local processor

        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

        l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
        l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)

        if (n+nblk > na) then

          ! This is the last step, just do a Cholesky-Factorization
          ! of the remaining block

          if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
670
671
            call timer%start("blas")

672
673
674
675
676
#ifdef DOUBLE_PRECISION_REAL
            call dpotrf('U', na-n+1, a(l_row1,l_col1), lda, info)
#else
            call spotrf('U', na-n+1, a(l_row1,l_col1), lda, info)
#endif
677
678
            call timer%stop("blas")

679
            if (info/=0) then
680
              if (wantDebug) write(error_unit,*) "elpa_cholesky_real: Error in dpotrf"
681
682
683
684
685
686
687
              success = .false.
              return
            endif

          endif

          exit ! Loop
688

689
690
691
692
693
694
695
696
        endif

        if (my_prow==prow(n, nblk, np_rows)) then

          if (my_pcol==pcol(n, nblk, np_cols)) then

            ! The process owning the upper left remaining block does the
            ! Cholesky-Factorization of this block
697
698
            call timer%start("blas")

699
700
701
702
703
#ifdef DOUBLE_PRECISION_REAL
            call dpotrf('U', nblk, a(l_row1,l_col1), lda, info)
#else
            call spotrf('U', nblk, a(l_row1,l_col1), lda, info)
#endif
704
705
            call timer%stop("blas")

706
            if (info/=0) then
707
              if (wantDebug) write(error_unit,*) "elpa_cholesky_real: Error in dpotrf"
708
709
710
711
712
713
714
715
716
717
718
              success = .false.
              return
            endif

            nc = 0
            do i=1,nblk
              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
              nc = nc+i
            enddo
          endif
#ifdef WITH_MPI
719
          call timer%start("mpi_communication")
720
721
722
723
724

#ifdef DOUBLE_PRECISION_REAL
          call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
          call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
725
#endif
726
          call timer%stop("mpi_communication")
727
#endif /* WITH_MPI */
728
729
730
731
732
733
          nc = 0
          do i=1,nblk
            tmp2(1:i,i) = tmp1(nc+1:nc+i)
            nc = nc+i
          enddo

734
          call timer%start("blas")
735
          if (l_cols-l_colx+1>0) &
736
737
738
739
740
#ifdef DOUBLE_PRECISION_REAL
              call dtrsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
              call strsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
741
742
          call timer%stop("blas")

743
744
745
746
        endif

        do i=1,nblk

747
          if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
748
#ifdef WITH_MPI
749
          call timer%start("mpi_communication")
750
          if (l_cols-l_colx+1>0) &
751
752
753
754
#ifdef DOUBLE_PRECISION_REAL
              call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#else
              call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
755
#endif
756
          call timer%stop("mpi_communication")
757
#endif /* WITH_MPI */
758
759
        enddo
        ! this has to be checked since it was changed substantially when doing type safe
760
761
762
763
764
765
766
767
768
769
#ifdef DOUBLE_PRECISION_REAL
        call elpa_transpose_vectors_real_double  (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
                                      tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
                                      n, na, nblk, nblk)
#else
        call elpa_transpose_vectors_real_single  (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
                                      tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
                                      n, na, nblk, nblk)
#endif

770
771
772
773
774
775
        do i=0,(na-1)/tile_size
          lcs = max(l_colx,i*l_cols_tile+1)
          lce = min(l_cols,(i+1)*l_cols_tile)
          lrs = l_rowx
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<lrs) cycle
776
777
          call timer%start("blas")

778
779
780
781
782
783
784
785
786
#ifdef DOUBLE_PRECISION_REAL
          call DGEMM('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk8,                        &
                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                     1.0_rk8, a(lrs,lcs), lda)
#else
          call SGEMM('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk4,                        &
                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
                     1.0_rk4, a(lrs,lcs), lda)
#endif
787
          call timer%stop("blas")
788
789
790
791
792
793
        enddo

      enddo

      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
794
        print *,"elpa_cholesky_real: error when deallocating tmp1 "//errorMessage
795
796
797
798
799
800
801
802
803
804
805
806
807
        stop
      endif

      ! Set the lower triangle to 0, it contains garbage (form the above matrix multiplications)

      do i=1,na
        if (my_pcol==pcol(i, nblk, np_cols)) then
          ! column i is on local processor
          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
          a(l_row1:l_rows,l_col1) = 0
        endif
      enddo
808
#ifdef DOUBLE_PRECISION_REAL
809
      call timer%stop("elpa_cholesky_real_double")
810
#else
811
      call timer%stop("elpa_cholesky_real_single")
812
813
#endif

814
    end function elpa_cholesky_real_single

#endif /* WANT_SINGLE_PRECISION_REAL */

#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
!> \brief  elpa_invert_trm_real_double: Inverts a double-precision real upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param  wantDebug            logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_invert_trm_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
       ! Inverts the distributed upper triangular matrix a in place, processing
       ! nblk-sized diagonal blocks from the bottom-right corner upwards.
       ! Only the upper triangle of a is referenced and updated.
       use precision
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
#ifdef HAVE_DETAILED_TIMINGS
       use timings
#else
       use timings_dummy
#endif
       implicit none

       integer(kind=ik)             :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
       real(kind=REAL_DATATYPE)     :: a(lda,*)
#else
       real(kind=REAL_DATATYPE)     :: a(lda,matrixCols)
#endif
       integer(kind=ik)             :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)             :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)             :: n, nc, i, info, ns, nb

       ! tmp1:  packed upper triangle of the inverted diagonal block (column-wise)
       ! tmp2:  the same triangle unpacked into an nblk x nblk array
       ! tmat1: columns of a above the current diagonal block (for the GEMM update)
       ! tmat2: row panel of the current block row (for the GEMM update)
       real(kind=REAL_DATATYPE), allocatable   :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)          :: wantDebug
       logical                      :: success
       integer(kind=ik)             :: istat
       character(200)               :: errorMessage

       call timer%start("mpi_communication")
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       call timer%stop("mpi_communication")
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp1 "//errorMessage
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp2 "//errorMessage
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat1 "//errorMessage
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat2 "//errorMessage
         stop
       endif

       tmat1 = 0
       tmat2 = 0

       ! First (i.e. last processed) block starts at the largest multiple of
       ! nblk below na, so the loop walks block rows from bottom to top.
       ns = ((na-1)/nblk)*nblk + 1

       do n = ns,1,-nblk

         ! Local indices of the current diagonal block (l_row1/l_col1) and of
         ! the first row/column behind it (l_rowx/l_colx)
         l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
         l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

         nb = nblk
         if (na-n+1 < nblk) nb = na-n+1  ! last block may be smaller

         l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
         l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)

         if (my_prow==prow(n, nblk, np_rows)) then

           if (my_pcol==pcol(n, nblk, np_cols)) then

             ! The owner of the diagonal block inverts it in place
             call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
             call DTRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#else
             call STRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#endif
             call timer%stop("blas")

             if (info/=0) then
               if (wantDebug) write(error_unit,*) "elpa_invert_trm_real: Error in DTRTRI"
               success = .false.
               return
             endif

             ! Pack the inverted upper triangle column-wise into tmp1 for broadcast
             nc = 0
             do i=1,nb
               tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
               nc = nc+i
             enddo
           endif
#ifdef WITH_MPI
           call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
           call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           ! Unpack the broadcast triangle into tmp2 (lower left stays zero)
           nc = 0
           do i=1,nb
             tmp2(1:i,i) = tmp1(nc+1:nc+i)
             nc = nc+i
           enddo

           ! Multiply the remainder of the block row by the inverted diagonal block
           call timer%start("blas")
           if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_REAL
               call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
               call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
           call timer%stop("blas")

           ! Assemble the full block row (updated tail + inverted diagonal block)
           if (l_colx<=l_cols)   tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
           if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0

         endif

         if (l_row1>1) then
           ! Save the column panel above the diagonal block and zero it in a;
           ! it is rebuilt below via the GEMM update
           if (my_pcol==pcol(n, nblk, np_cols)) then
             tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
             a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
           endif

           do i=1,nb
#ifdef WITH_MPI
             call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
             call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           enddo
         endif
#ifdef WITH_MPI
         call timer%start("mpi_communication")
         if (l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#else
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#endif
         call timer%stop("mpi_communication")
#endif /* WITH_MPI */

         ! Rank-nb update: a(1:n-1, n:na) := -tmat1 * tmat2 + a(1:n-1, n:na)
         call timer%start("blas")
         if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk8,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk8, a(1,l_col1), lda)
#else
            call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk4,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk4, a(1,l_col1), lda)
#endif
         call timer%stop("blas")

       enddo

       deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when deallocating tmp1 "//errorMessage
         stop
       endif
     end function elpa_invert_trm_real_double

#if WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define REAL_DATATYPE rk4
!> \brief  elpa_invert_trm_real_single: Inverts a single-precision real upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param  wantDebug            logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_invert_trm_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
1041
1042
1043
1044
       use precision
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
1045
1046
#ifdef HAVE_DETAILED_TIMINGS
       use timings
1047
1048
#else
       use timings_dummy
1049
#endif
1050
1051
1052
       implicit none

       integer(kind=ik)             :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
1053
1054
#ifdef USE_ASSUMED_SIZE
       real(kind=REAL_DATATYPE)     :: a(lda,*)
1055
#else
1056
       real(kind=REAL_DATATYPE)     :: a(lda,matrixCols)
1057
1058
1059
1060
1061
1062
1063
1064
#endif
       integer(kind=ik)             :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)             :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)             :: n, nc, i, info, ns, nb

       real(kind=REAL_DATATYPE), allocatable   :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)          :: wantDebug
1065
       logical                      :: success
1066
1067
       integer(kind=ik)             :: istat
       character(200)               :: errorMessage
1068

1069
       call timer%start("mpi_communication")
1070
1071
1072
1073
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
1074
       call timer%stop("mpi_communication")
1075
1076
1077
1078
1079
1080
1081
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
1082
         print *,"elpa_invert_trm_real: error when allocating tmp1 "//errorMessage
1083
1084
1085
1086
1087
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
1088
         print *,"elpa_invert_trm_real: error when allocating tmp2 "//errorMessage
1089
1090
1091
1092
1093
1094
1095
1096
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
1097
         print *,"elpa_invert_trm_real: error when allocating tmat1 "//errorMessage
1098
1099
1100
1101
1102
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
1103
         print *,"elpa_invert_trm_real: error when allocating tmat2 "//errorMessage