!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

#include "config-f90.h"

!> \brief Fortran module which provides helper routines for matrix calculations
module ELPA1_AUXILIARY
  use elpa_utilities

  implicit none

  public :: elpa_mult_at_b_real_double      !< Multiply double-precision real matrices A**T * B
  public :: mult_at_b_real                  !< Old, deprecated interface to multiply double-precision real matrices A**T * B. DO NOT USE

  public :: elpa_mult_ah_b_complex_double   !< Multiply double-precision complex matrices A**H * B
  public :: mult_ah_b_complex               !< Old, deprecated interface to multiply double-precision complex matrices A**H * B. DO NOT USE

  public :: elpa_invert_trm_real_double     !< Invert double-precision real triangular matrix
  public :: invert_trm_real                 !< Old, deprecated interface for inversion of double-precision real triangular matrix. DO NOT USE

  public :: elpa_invert_trm_complex_double  !< Invert double-precision complex triangular matrix
  public :: invert_trm_complex              !< Old, deprecated interface to invert double-precision complex triangular matrix. DO NOT USE

  public :: elpa_cholesky_real_double       !< Cholesky factorization of a double-precision real matrix
  public :: cholesky_real                   !< Old, deprecated name for Cholesky factorization of a double-precision real matrix. DO NOT USE

  public :: elpa_cholesky_complex_double    !< Cholesky factorization of a double-precision complex matrix
  public :: cholesky_complex                !< Old, deprecated interface for a Cholesky factorization of a double-precision complex matrix. DO NOT USE

  public :: elpa_solve_tridi_double         !< Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method
  public :: solve_tridi                     !< Old, deprecated interface to solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method

#ifdef WANT_SINGLE_PRECISION_REAL
  public :: elpa_cholesky_real_single       !< Cholesky factorization of a single-precision real matrix
  public :: elpa_invert_trm_real_single     !< Invert single-precision real triangular matrix
  public :: elpa_mult_at_b_real_single      !< Multiply single-precision real matrices A**T * B
  public :: elpa_solve_tridi_single         !< Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
  public :: elpa_cholesky_complex_single    !< Cholesky factorization of a single-precision complex matrix
  public :: elpa_invert_trm_complex_single  !< Invert single-precision complex triangular matrix
  public :: elpa_mult_ah_b_complex_single   !< Multiply single-precision complex matrices A**H * B
#endif

!> \brief  cholesky_real: old, deprecated interface for Cholesky factorization of a double-precision real symmetric matrix
!> \details
!>
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure
  interface cholesky_real
    module procedure elpa_cholesky_real_double
  end interface
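! A minimal usage sketch for the deprecated interface above (illustration only,
! not part of the library). It assumes the caller has already created the row
! and column communicators, e.g. by splitting an MPI communicator over a 2D
! process grid, and has set the upper triangle of the block-cyclically
! distributed matrix a(lda,matrixCols):
!
!   logical :: ok
!   ok = cholesky_real(na, a, lda, nblk, matrixCols, &
!                      mpi_comm_rows, mpi_comm_cols, .true.)
!   if (.not. ok) write(error_unit,*) "cholesky_real failed"
!
! On success the upper triangle of a contains the Cholesky factor and the lower
! triangle is set to 0. New code should call elpa_cholesky_real_double directly.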

!> \brief  Old, deprecated interface invert_trm_real: Inverts a double-precision real upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

  interface invert_trm_real
    module procedure elpa_invert_trm_real_double
  end interface

!> \brief  old, deprecated interface cholesky_complex: Cholesky factorization of a double-precision complex hermitian matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure


  interface cholesky_complex
    module procedure elpa_cholesky_complex_double
  end interface

!> \brief  old, deprecated interface invert_trm_complex: Inverts a double-precision complex upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

  interface invert_trm_complex
    module procedure elpa_invert_trm_complex_double
  end interface

!> \brief  mult_at_b_real: Performs C := A**T * B for double-precision real matrices
!> This is the old, deprecated interface for the newer elpa_mult_at_b_real
!>         where   A is a square matrix (na,na) which is optionally upper or lower triangular
!>                 B is a (na,ncb) matrix
!>                 C is a (na,ncb) matrix where optionally only the upper or lower
!>                   triangle may be computed
!> \details

!> \param  uplo_a               'U' if A is upper triangular
!>                              'L' if A is lower triangular
!>                              anything else if A is a full matrix
!>                              Please note: This pertains to the original A (as set in the calling program)
!>                                           whereas the transpose of A is used for calculations
!>                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                              i.e. it may contain arbitrary numbers
!> \param uplo_c                'U' if only the upper diagonal part of C is needed
!>                              'L' if only the lower diagonal part of C is needed
!>                              anything else if the full matrix C is needed
!>                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                            written to a certain extent, i.e. one shouldn't rely on the content there!
!> \param na                    Number of rows/columns of A, number of rows of B and C
!> \param ncb                   Number of columns  of B and C
!> \param a                     matrix a
!> \param lda                   leading dimension of matrix a
!> \param b                     matrix b
!> \param ldb                   leading dimension of matrix b
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param c                     matrix c
!> \param ldc                   leading dimension of matrix c
  interface mult_at_b_real
    module procedure elpa_mult_at_b_real_double
  end interface
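! A minimal usage sketch for the deprecated interface above (illustration only,
! not part of the library; new code should call elpa_mult_at_b_real directly).
! Whether the wrapper is invoked with "call" or as a logical function depends on
! the module procedure it maps to in the ELPA version at hand; the sketch below
! assumes the subroutine form of older releases. With A upper triangular and all
! of C wanted:
!
!   call mult_at_b_real('U', 'F', na, ncb, a, lda, b, ldb, nblk, &
!                       mpi_comm_rows, mpi_comm_cols, c, ldc)
!
! Here 'F' merely stands for "anything other than 'U' or 'L'", i.e. the full
! matrix C is computed.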

!> \brief  Old, deprecated interface mult_ah_b_complex: Performs C := A**H * B for double-precision complex matrices
!>         where   A is a square matrix (na,na) which is optionally upper or lower triangular
!>                 B is a (na,ncb) matrix
!>                 C is a (na,ncb) matrix where optionally only the upper or lower
!>                   triangle may be computed
!> \details
!>
!> \param  uplo_a               'U' if A is upper triangular
!>                              'L' if A is lower triangular
!>                              anything else if A is a full matrix
!>                              Please note: This pertains to the original A (as set in the calling program)
!>                                           whereas the conjugate transpose of A is used for calculations
!>                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                              i.e. it may contain arbitrary numbers
!> \param uplo_c                'U' if only the upper diagonal part of C is needed
!>                              'L' if only the lower diagonal part of C is needed
!>                              anything else if the full matrix C is needed
!>                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                            written to a certain extent, i.e. one shouldn't rely on the content there!
!> \param na                    Number of rows/columns of A, number of rows of B and C
!> \param ncb                   Number of columns  of B and C
!> \param a                     matrix a
!> \param lda                   leading dimension of matrix a
!> \param b                     matrix b
!> \param ldb                   leading dimension of matrix b
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param c                     matrix c
!> \param ldc                   leading dimension of matrix c
  interface mult_ah_b_complex
    module procedure elpa_mult_ah_b_complex_double
  end interface


!> \brief  solve_tridi: Old, deprecated interface to solve a tridiagonal eigensystem for a double-precision matrix with the divide and conquer method
!> \details
!>
!> \param na                    Matrix dimension
!> \param nev                   number of eigenvalues/vectors to be computed
!> \param d                     array d(na) on input diagonal elements of tridiagonal matrix, on
!>                              output the eigenvalues in ascending order
!> \param e                     array e(na) on input subdiagonal elements of matrix, on exit destroyed
!> \param q                     on exit : matrix q(ldq,matrixCols) contains the eigenvectors
!> \param ldq                   leading dimension of matrix q
!> \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
!> \param matrixCols            columns of matrix q
!> \param mpi_comm_rows         MPI communicator for rows
!> \param mpi_comm_cols         MPI communicator for columns
!> \param wantDebug             logical, give more debug information if .true.
!> \result success              logical, .true. on success, else .false.
  interface solve_tridi
    module procedure elpa_solve_tridi_double
  end interface
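! A minimal usage sketch for the deprecated interface above (illustration only,
! not part of the library; new code should call elpa_solve_tridi_double
! directly). d and e hold the diagonal and subdiagonal of the tridiagonal
! matrix, q receives the eigenvectors:
!
!   logical :: ok
!   ok = solve_tridi(na, nev, d, e, q, ldq, nblk, matrixCols, &
!                    mpi_comm_rows, mpi_comm_cols, .true.)
!   if (.not. ok) write(error_unit,*) "solve_tridi failed"
!
! On return d(1:na) holds the eigenvalues in ascending order, q the nev computed
! eigenvectors, and e is destroyed.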

  contains

!> \brief  elpa_cholesky_real_double: Cholesky factorization of a double-precision real symmetric matrix
!> \details
!>
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

#define REALCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"


   function elpa_cholesky_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                            wantDebug) result(success)
#include "elpa_cholesky_template.X90"

    end function elpa_cholesky_real_double

#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "precision_macros.h"

!> \brief  elpa_cholesky_real_single: Cholesky factorization of a single-precision real symmetric matrix
!> \details
!>
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

   function elpa_cholesky_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                            wantDebug) result(success)
#include "elpa_cholesky_template.X90"

    end function elpa_cholesky_real_single

#endif /* WANT_SINGLE_PRECISION_REAL */

#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
!> \brief  elpa_invert_trm_real_double: Inverts a double-precision real upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  matrixCols           local columns of matrix a
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_invert_trm_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
       use precision
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
#ifdef HAVE_DETAILED_TIMINGS
       use timings
#else
       use timings_dummy
#endif
       implicit none

       integer(kind=ik)             :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
       real(kind=REAL_DATATYPE)     :: a(lda,*)
#else
       real(kind=REAL_DATATYPE)     :: a(lda,matrixCols)
#endif
       integer(kind=ik)             :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)             :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)             :: n, nc, i, info, ns, nb

       real(kind=REAL_DATATYPE), allocatable   :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)          :: wantDebug
       logical                      :: success
       integer(kind=ik)             :: istat
       character(200)               :: errorMessage

       call timer%start("mpi_communication")
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       call timer%stop("mpi_communication")
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp1 "//errorMessage
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp2 "//errorMessage
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat1 "//errorMessage
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat2 "//errorMessage
         stop
       endif

       tmat1 = 0
       tmat2 = 0
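       ! The inversion proceeds block column by block column, starting with the
       ! last (possibly partial) diagonal block and moving towards the top-left
       ! corner. For each block column n:
       !   1. the process owning the nb x nb diagonal block inverts it in place
       !      (TRTRI) and packs its upper triangle into tmp1,
       !   2. the packed block is broadcast to the processes holding the rest of
       !      that block row and unpacked into tmp2,
       !   3. the block row to the right of the diagonal block is multiplied from
       !      the left by the inverted block (TRMM) and collected in tmat2,
       !   4. the block column above the diagonal block is saved in tmat1 and
       !      zeroed in a; after broadcasting tmat1 and tmat2, the trailing part
       !      a(1:n-1, n:na) is updated with a rank-nb GEMM.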


       ns = ((na-1)/nblk)*nblk + 1

       do n = ns,1,-nblk

         l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
         l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

         nb = nblk
         if (na-n+1 < nblk) nb = na-n+1

         l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
         l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)

         if (my_prow==prow(n, nblk, np_rows)) then

           if (my_pcol==pcol(n, nblk, np_cols)) then
             call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
             call DTRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#else
             call STRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#endif
             call timer%stop("blas")

             if (info/=0) then
               if (wantDebug) write(error_unit,*) "elpa_invert_trm_real: Error in DTRTRI"
               success = .false.
               return
             endif

             nc = 0
             do i=1,nb
               tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
               nc = nc+i
             enddo
           endif
#ifdef WITH_MPI
           call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
           call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           nc = 0
           do i=1,nb
             tmp2(1:i,i) = tmp1(nc+1:nc+i)
             nc = nc+i
           enddo

           call timer%start("blas")
           if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_REAL
               call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
               call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
           call timer%stop("blas")
           if (l_colx<=l_cols)   tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
           if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0

         endif

         if (l_row1>1) then
           if (my_pcol==pcol(n, nblk, np_cols)) then
             tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
             a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
           endif

           do i=1,nb
#ifdef WITH_MPI
             call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
             call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           enddo
         endif
#ifdef WITH_MPI
         call timer%start("mpi_communication")
         if (l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#else
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#endif
          call timer%stop("mpi_communication")
#endif /* WITH_MPI */

         call timer%start("blas")
         if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk8,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk8, a(1,l_col1), lda)
#else
            call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk4,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk4, a(1,l_col1), lda)
#endif
         call timer%stop("blas")

       enddo

       deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when deallocating tmp1 "//errorMessage
         stop
       endif
     end function elpa_invert_trm_real_double
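! A typical combined use of elpa_cholesky_real_double and
! elpa_invert_trm_real_double (illustration only, not part of the library),
! e.g. as a first step of reducing a generalized eigenproblem to standard form.
! All names below are the caller's own variables; the communicators and the
! distributed matrix b are set up as described in the documentation above:
!
!   logical :: ok
!   ok = elpa_cholesky_real_double(na, b, lda, nblk, matrixCols, &
!                                  mpi_comm_rows, mpi_comm_cols, .true.)
!   if (ok) ok = elpa_invert_trm_real_double(na, b, lda, nblk, matrixCols, &
!                                            mpi_comm_rows, mpi_comm_cols, .true.)
!   if (.not. ok) write(error_unit,*) "factorization or inversion failed"
!
! After both calls the upper triangle of b holds U**(-1), where b = U**T * U
! was the original factorization.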

#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#define REAL_DATATYPE rk4
!> \brief  elpa_invert_trm_real_single: Inverts a single-precision real upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_invert_trm_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
       use precision
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
#ifdef HAVE_DETAILED_TIMINGS
       use timings
#else
       use timings_dummy
#endif
       implicit none

       integer(kind=ik)             :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
       real(kind=REAL_DATATYPE)     :: a(lda,*)
#else
       real(kind=REAL_DATATYPE)     :: a(lda,matrixCols)
#endif
       integer(kind=ik)             :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)             :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)             :: n, nc, i, info, ns, nb

       real(kind=REAL_DATATYPE), allocatable   :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)          :: wantDebug
       logical                      :: success
       integer(kind=ik)             :: istat
       character(200)               :: errorMessage

       call timer%start("mpi_communication")
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       call timer%stop("mpi_communication")
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp1 "//errorMessage
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmp2 "//errorMessage
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat1 "//errorMessage
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when allocating tmat2 "//errorMessage
         stop
       endif

       tmat1 = 0
       tmat2 = 0


       ns = ((na-1)/nblk)*nblk + 1

       do n = ns,1,-nblk

         l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
         l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

         nb = nblk
         if (na-n+1 < nblk) nb = na-n+1

         l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
         l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)

         if (my_prow==prow(n, nblk, np_rows)) then

           if (my_pcol==pcol(n, nblk, np_cols)) then
             call timer%start("blas")
#ifdef DOUBLE_PRECISION_REAL
             call DTRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#else
             call STRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#endif
            call timer%stop("blas")

             if (info/=0) then
               if (wantDebug) write(error_unit,*) "elpa_invert_trm_real: Error in STRTRI"
               success = .false.
               return
             endif

             nc = 0
             do i=1,nb
               tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
               nc = nc+i
             enddo
           endif
#ifdef WITH_MPI
           call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
           call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           nc = 0
           do i=1,nb
             tmp2(1:i,i) = tmp1(nc+1:nc+i)
             nc = nc+i
           enddo

            call timer%start("blas")

           if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_REAL
               call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
               call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
           call timer%stop("blas")
           if (l_colx<=l_cols)   tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
           if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0

         endif

         if (l_row1>1) then
           if (my_pcol==pcol(n, nblk, np_cols)) then
             tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
             a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
           endif

           do i=1,nb
#ifdef WITH_MPI
             call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_REAL
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
             call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           enddo
         endif
#ifdef WITH_MPI
         call timer%start("mpi_communication")
         if (l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#else
            call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
#endif
          call timer%stop("mpi_communication")
#endif /* WITH_MPI */

         call timer%start("blas")
         if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_REAL
            call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk8,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk8, a(1,l_col1), lda)
#else
            call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk4,                 &
                       tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                       1.0_rk4, a(1,l_col1), lda)
#endif
         call timer%stop("blas")

       enddo

       deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_real: error when deallocating tmp1 "//errorMessage
         stop
       endif
     end function elpa_invert_trm_real_single

#endif /* WANT_SINGLE_PRECISION_REAL */


#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"

!> \brief  elpa_cholesky_complex_double: Cholesky factorization of a double-precision complex hermitian matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_cholesky_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)

#include "elpa_cholesky_template.X90"

    end function elpa_cholesky_complex_double


#ifdef WANT_SINGLE_PRECISION_COMPLEX
#define COMPLEXCASE 1
#define SINGLE_PRECISION
#include "precision_macros.h"

!> \brief  elpa_cholesky_complex_single: Cholesky factorization of a single-precision complex hermitian matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              On return, the upper triangle contains the Cholesky factor
!>                              and the lower triangle is set to 0.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure
    function elpa_cholesky_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)

#include "elpa_cholesky_template.X90"

    end function elpa_cholesky_complex_single

#endif /* WANT_SINGLE_PRECISION_COMPLEX */


#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
#define DOUBLE_PRECISION_COMPLEX 1
#define COMPLEX_DATATYPE CK8
!> \brief  elpa_invert_trm_complex_double: Inverts a double-precision complex upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

     function elpa_invert_trm_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
       use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
       implicit none
       integer(kind=ik)                 :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
       complex(kind=COMPLEX_DATATYPE)   :: a(lda,*)
#else
       complex(kind=COMPLEX_DATATYPE)   :: a(lda,matrixCols)
#endif
       integer(kind=ik)                 :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)                 :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)                 :: n, nc, i, info, ns, nb

       complex(kind=COMPLEX_DATATYPE), allocatable    :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)              :: wantDebug
       logical                          :: success
       integer(kind=ik)                 :: istat
       character(200)                   :: errorMessage
#ifdef DOUBLE_PRECISION_COMPLEX
      call timer%start("elpa_invert_trm_complex_double")
#else
      call timer%start("elpa_invert_trm_complex_single")
#endif
      call timer%start("mpi_communication")
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       call timer%stop("mpi_communication")
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmp1 "//errorMessage
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmp2 "//errorMessage
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmat1 "//errorMessage
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmat2 "//errorMessage
         stop
       endif

       tmat1 = 0
       tmat2 = 0
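       ! The loop below mirrors the real-valued blocked inversion in
       ! elpa_invert_trm_real_double: the diagonal block is inverted with
       ! ZTRTRI (CTRTRI in single precision), the block row to its right is
       ! scaled with ZTRMM, and the block columns above are updated with ZGEMM,
       ! using the same broadcast pattern for tmp1, tmat1 and tmat2.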

       ns = ((na-1)/nblk)*nblk + 1

       do n = ns,1,-nblk

         l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
         l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

         nb = nblk
         if (na-n+1 < nblk) nb = na-n+1

         l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
         l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)

         if (my_prow==prow(n, nblk, np_rows)) then

           if (my_pcol==pcol(n, nblk, np_cols)) then
             call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
             call ZTRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#else
             call CTRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
#endif
             call timer%stop("blas")
             if (info/=0) then
               if (wantDebug) write(error_unit,*) "elpa_invert_trm_complex: Error in ZTRTRI"
               success = .false.
               return
             endif

             nc = 0
             do i=1,nb
               tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
               nc = nc+i
             enddo
           endif
#ifdef WITH_MPI
           call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_COMPLEX
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
           call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
           call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           nc = 0
           do i=1,nb
             tmp2(1:i,i) = tmp1(nc+1:nc+i)
             nc = nc+i
           enddo

           call timer%start("blas")
           if (l_cols-l_colx+1>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
             call ZTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk8,0.0_rk8), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#else
             call CTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk4,0.0_rk4), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
#endif
           call timer%stop("blas")
           if (l_colx<=l_cols)   tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
           if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0

         endif

         if (l_row1>1) then
           if (my_pcol==pcol(n, nblk, np_cols)) then
             tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
             a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
           endif

           do i=1,nb
#ifdef WITH_MPI
             call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_COMPLEX
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#else
             call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
             call timer%stop("mpi_communication")
#endif /* WITH_MPI */
           enddo
         endif
#ifdef WITH_MPI
          call timer%start("mpi_communication")
         if (l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
           call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), &
                          mpi_comm_rows, mpierr)
#else
           call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_COMPLEX, prow(n, nblk, np_rows), &
                          mpi_comm_rows, mpierr)
#endif
          call timer%stop("mpi_communication")
#endif /* WITH_MPI */

         call timer%start("blas")
         if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
           call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8),        &
                      tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                      (1.0_rk8,0.0_rk8), a(1,l_col1), lda)
#else
           call CGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk4,0.0_rk4),        &
                      tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
                      (1.0_rk4,0.0_rk4), a(1,l_col1), lda)
#endif
          call timer%stop("blas")
       enddo

       deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when deallocating tmp1 "//errorMessage
         stop
       endif

#ifdef DOUBLE_PRECISION_COMPLEX
      call timer%stop("elpa_invert_trm_complex_double")
#else
      call timer%stop("elpa_invert_trm_complex_single")
#endif

    end function elpa_invert_trm_complex_double

#ifdef WANT_SINGLE_PRECISION_COMPLEX

#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
#define COMPLEX_DATATYPE CK4
!> \brief  elpa_invert_trm_complex_single: Inverts a single-precision complex upper triangular matrix
!> \details
!> \param  na                   Order of matrix
!> \param  a(lda,matrixCols)    Distributed matrix which should be inverted
!>                              Distribution is like in Scalapack.
!>                              Only upper triangle needs to be set.
!>                              The lower triangle is not referenced.
!> \param  lda                  Leading dimension of a
!> \param  matrixCols           local columns of matrix a
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!> \param  mpi_comm_rows        MPI communicator for rows
!> \param  mpi_comm_cols        MPI communicator for columns
!> \param wantDebug             logical, more debug information on failure
!> \result success              logical, reports success or failure

    function elpa_invert_trm_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug) result(success)
       use elpa1_compute
       use elpa_utilities
       use elpa_mpi
       use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
       implicit none
       integer(kind=ik)                 :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
       complex(kind=COMPLEX_DATATYPE)   :: a(lda,*)
#else
       complex(kind=COMPLEX_DATATYPE)   :: a(lda,matrixCols)
#endif
       integer(kind=ik)                 :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)                 :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
       integer(kind=ik)                 :: n, nc, i, info, ns, nb

       complex(kind=COMPLEX_DATATYPE), allocatable    :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)

       logical, intent(in)              :: wantDebug
       logical                          :: success
       integer(kind=ik)                 :: istat
       character(200)                   :: errorMessage
#ifdef DOUBLE_PRECISION_COMPLEX
      call timer%start("elpa_invert_trm_complex_double")
#else
      call timer%start("elpa_invert_trm_complex_single")
#endif
       call timer%start("mpi_communication")
       call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
       call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
       call timer%stop("mpi_communication")
       success = .true.

       l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
       l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a

       allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmp1 "//errorMessage
         stop
       endif

       allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmp2 "//errorMessage
         stop
       endif

       tmp1 = 0
       tmp2 = 0

       allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmat1 "//errorMessage
         stop
       endif

       allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"elpa_invert_trm_complex: error when allocating tmat2 "//errorMessage
         stop
       endif

       tmat1 = 0
       tmat2 = 0

       ns = ((na-1)/nblk)*nblk + 1

       do n = ns,1,-nblk

         l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
         l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)

         nb = nblk
         if (na-n+1 < nblk) nb = na-n+1

         l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
         l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)

         if (my_prow==prow(n, nblk, np_rows)) then

           if (my_pcol==pcol(n, nblk, np_cols)) then