elpa2_compute_real_template.X90 22.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
46

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

64
65
66
67
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa2_bandred_template.X90"
#undef REALCASE
68
69
70
71
#include "elpa2_symm_matrix_allreduce_real_template.X90"
#include "elpa2_trans_ev_band_to_full_real_template.X90"
#include "elpa2_tridiag_band_real_template.X90"
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"
72
73
74



75
    subroutine band_band_real_PRECISION(na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, mpi_comm)
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
    !-------------------------------------------------------------------------------
    ! band_band_real:
    ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
    ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
    ! zero entries, if this
    ! requirement doesn't hold)
    !
    !  na          Order of matrix
    !
    !  nb          Semi bandwidth of original matrix
    !
    !  nb2         Semi bandwidth of target matrix
    !
    !  ab          Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
    !              has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
    !              are located on rank n.
    !
    !  ab2         Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
    !              accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
    !              on rank n.
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
    !  mpi_comm
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
      use timings
106
107
#else
     use timings_dummy
108
#endif
109
      use elpa2_workload
110
111
112
      use precision
      implicit none

Andreas Marek's avatar
Andreas Marek committed
113
      integer(kind=ik), intent(in)             :: na, nb, nbCol, nb2, nb2Col, mpi_comm
114
115
116
117
118
      real(kind=REAL_DATATYPE), intent(inout)  :: ab(2*nb,nbCol) ! removed assumed size
      real(kind=REAL_DATATYPE), intent(inout)  :: ab2(2*nb2,nb2Col) ! removed assumed size
      real(kind=REAL_DATATYPE), intent(out)    :: d(na), e(na) ! set only on PE 0

      real(kind=REAL_DATATYPE)                 :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
Andreas Marek's avatar
Andreas Marek committed
119
                                                  tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
120
121

      real(kind=REAL_DATATYPE)                 :: work(nb*nb2), work2(nb2*nb2)
Andreas Marek's avatar
Andreas Marek committed
122
123
124
125
126
127
128
129
      integer(kind=ik)                         :: lwork, info

      integer(kind=ik)                         :: istep, i, n, dest
      integer(kind=ik)                         :: n_off, na_s
      integer(kind=ik)                         :: my_pe, n_pes, mpierr
      integer(kind=ik)                         :: nblocks_total, nblocks
      integer(kind=ik)                         :: nblocks_total2, nblocks2
      integer(kind=ik)                         :: ireq_ab, ireq_hv
130
#ifdef WITH_MPI
131
!      integer(kind=ik)                         :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
132
#endif
133
!      integer(kind=ik), allocatable            :: mpi_statuses(:,:)
Andreas Marek's avatar
Andreas Marek committed
134
      integer(kind=ik), allocatable            :: block_limits(:), block_limits2(:), ireq_ab2(:)
135

Andreas Marek's avatar
Andreas Marek committed
136
137
138
      integer(kind=ik)                         :: j, nc, nr, ns, ne, iblk
      integer(kind=ik)                         :: istat
      character(200)                           :: errorMessage
139

140
      call timer%start("band_band_real" // PRECISION_SUFFIX)
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
!      if (na .lt. 2*nb) then
!        print *,"na lt 2*nb ",na,2*nb
!        stop
!      endif
!      if (na .lt. 2*nb2) then
!        print *,"na lt 2*nb2 ",na,2*nb2
!        stop
!      endif
!      if (na .lt. nbCol) then
!        print *,"na lt nbCol ",na,nbCol
!        stop
!      endif
!      if (na .lt. nb2Col) then
!        print *,"na lt nb2Col ",na,nb2Col
!        stop
!      endif

158
      call timer%start("mpi_communication")
159
160
      call mpi_comm_rank(mpi_comm,my_pe,mpierr)
      call mpi_comm_size(mpi_comm,n_pes,mpierr)
161
162
      call timer%stop("mpi_communication")

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
      ! Total number of blocks in the band:
      nblocks_total = (na-1)/nb + 1
      nblocks_total2 = (na-1)/nb2 + 1

      ! Set work distribution
      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits "//errorMessage
        stop
      endif
      call divide_band(nblocks_total, n_pes, block_limits)

      allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits2 "//errorMessage
        stop
      endif

      call divide_band(nblocks_total2, n_pes, block_limits2)

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)
      nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)

      allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating ireq_ab2 "//errorMessage
        stop
      endif

#ifdef WITH_MPI
194
195
      call timer%start("mpi_communication")

196
197
198
199
200
      ireq_ab2 = MPI_REQUEST_NULL

      if (nb2>1) then
        do i=0,nblocks2-1

201
          call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, MPI_REAL_PRECISION, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr)
202
203
        enddo
      endif
204
205
      call timer%stop("mpi_communication")

206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#else /* WITH_MPI */
      ! carefull the "recieve" has to be done at the corresponding send or wait
!      if (nb2>1) then
!        do i=0,nblocks2-1
!          ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
!        enddo
!      endif

#endif /* WITH_MPI */
      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb
      lwork = nb*nb2
      dest = 0
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first nb2 columns to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        do i=1,nb2
          ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
        enddo
#ifdef WITH_MPI
235
        call timer%start("mpi_communication")
236

237
        call mpi_isend(ab_s, (nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
238
        call timer%stop("mpi_communication")
239
240
241
242
243
244
245
246
#endif /* WITH_MPI */
      endif

      do istep=1,na/nb2

        if (my_pe==0) then

          n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
247
248
          hv(:,:) = CONST_0_0
          tau(:) = CONST_0_0
249
250
251
252
253
254

          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
          if (istep < na/nb2) then

            ! Transform first block column of remaining matrix
255
	    call timer%start("blas")
256
            call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
257
	    call timer%stop("blas")
258
259

            do i=1,nb2
260
              hv(i,i) = CONST_1_0
261
              hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
262
              ab(1+nb2+1:2*nb,na_s-n_off+i-1) = CONST_0_0
263
264
265
266
267
268
269
270
            enddo

          endif

          if (nb2==1) then
            d(istep) = ab(1,na_s-n_off)
            e(istep) = ab(2,na_s-n_off)
            if (istep == na) then
271
              e(na) = CONST_0_0
272
273
            endif
          else
274
            ab_s2 = CONST_0_0
275
276
277
278
279
            ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
            if (block_limits2(dest+1)<istep) then
              dest = dest+1
            endif
#ifdef WITH_MPI
280
            call timer%start("mpi_communication")
281
            call mpi_send(ab_s2, 2*nb2*nb2, MPI_REAL_PRECISION, dest, 3, mpi_comm, mpierr)
282
            call timer%stop("mpi_communication")
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298

#else /* WITH_MPI */
            ! do irecv here
            if (nb2>1) then
              do i= 0,nblocks2-1
                ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
              enddo
            endif
#endif /* WITH_MPI */

          endif

        else
          if (na>na_s+nb2-1) then
            ! Receive Householder vectors from previous task, from PE owning subdiagonal
#ifdef WITH_MPI
299
            call timer%start("mpi_communication")
300
            call mpi_recv(hv, nb*nb2, MPI_REAL_PRECISION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
301
            call timer%stop("mpi_communication")
302
303
304
305
306
307
308

#else /* WITH_MPI */
           hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
#endif /* WITH_MPI */

            do i=1,nb2
              tau(i) = hv(i,i)
309
              hv(i,i) = CONST_1_0
310
311
312
313
314
315
316
            enddo
          endif
        endif

        na_s = na_s+nb2
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
317
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
318
319
320
321
322
323
324
325
326
327
328
329
          n_off = n_off + nb
        endif

        do iblk=1,nblocks
          ns = na_s + (iblk-1)*nb - n_off ! first column in block
          ne = ns+nb-nb2                    ! last column in block

          if (ns+n_off>na) exit

            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
330
            call wy_gen_PRECISION(nc,nb2,w,hv,tau,work,nb)
331
332
333
334

            if (iblk==nblocks .and. nc==nb) then
              !request last nb2 columns
#ifdef WITH_MPI
335
              call timer%start("mpi_communication")
336
              call mpi_recv(ab_r,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
337
              call timer%stop("mpi_communication")
338
339
340
341
342
343
344
345

#else /* WITH_MPI */
             ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
#endif /* WITH_MPI */
              do i=1,nb2
                ab(1:nb+1,ne+i-1) = ab_r(:,i)
              enddo
            endif
346
347
            hv_new(:,:) = CONST_0_0 ! Needed, last rows must be 0 for nr < nb
            tau_new(:) = CONST_0_0
348
349

            if (nr>0) then
350
              call wy_right_PRECISION(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
351
	      call timer%start("blas")
352
              call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
353
	      call timer%stop("blas")
354
              do i=1,nb2
355
                hv_new(i,i) = CONST_1_0
356
                hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
357
                ab(nb+2:,ns+i-1) = CONST_0_0
358
359
360
361
362
              enddo

              !send hh-vector
              if (iblk==nblocks) then
#ifdef WITH_MPI
363
364
                call timer%start("mpi_communication")

365
                call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
366
367
                call timer%stop("mpi_communication")

368
369
370
371
372
373
#endif
                hv_s = hv_new
                do i=1,nb2
                  hv_s(i,i) = tau_new(i)
                enddo
#ifdef WITH_MPI
374
                call timer%start("mpi_communication")
375
                call mpi_isend(hv_s,nb*nb2, MPI_REAL_PRECISION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
376
                call timer%stop("mpi_communication")
377
378
379
380
381
382
383

#else /* WITH_MPI */

#endif /* WITH_MPI */
              endif
            endif

384
            call wy_symm_PRECISION(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
385
386
387
388

            if (my_pe>0 .and. iblk==1) then
              !send first nb2 columns to previous PE
#ifdef WITH_MPI
389
390
              call timer%start("mpi_communication")

391
              call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
392
393
              call timer%stop("mpi_communication")

394
395
396
397
398
#endif
              do i=1,nb2
                ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
              enddo
#ifdef WITH_MPI
399
              call timer%start("mpi_communication")
400
              call mpi_isend(ab_s,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
401
              call timer%stop("mpi_communication")
402
403
404
405
406
407
408

#else /* WITH_MPI */

#endif /* WITH_MPI */
            endif

            if (nr>0) then
409
410
              call wy_gen_PRECISION(nr,nb2,w_new,hv_new,tau_new,work,nb)
              call wy_left_PRECISION(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
411
412
413
414
415
416
417
418
419
420
            endif

            ! Use new HH vector for the next block
            hv(:,:) = hv_new(:,:)
            tau = tau_new
          enddo
        enddo

        ! Finish the last outstanding requests
#ifdef WITH_MPI
421
422
         call timer%start("mpi_communication")

423
424
425
426
427
428
429
        call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
        call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
!        allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error allocating mpi_statuses "//errorMessage
!          stop
!        endif
430

431
432
433
434
435
436
        call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
!        deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error deallocating mpi_statuses "//errorMessage
!          stop
!        endif
437
438

        call mpi_barrier(mpi_comm,mpierr)
439
440
        call timer%stop("mpi_communication")

441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#endif /* WITH_MPI */

        deallocate(block_limits, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits "//errorMessage
          stop
        endif

        deallocate(block_limits2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits2 "//errorMessage
          stop
        endif

        deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating ireq_ab2 "//errorMessage
          stop
        endif

461
        call timer%stop("band_band_real" // PRECISION_SUFFIX)
462
463
464

    end subroutine

465
    subroutine wy_gen_PRECISION(n, nb, W, Y, tau, mem, lda)
466
467
468

#ifdef HAVE_DETAILED_TIMINGS
   use timings
469
470
#else
   use timings_dummy
471
472
473
#endif
      use precision
      implicit none
474
475
476
477
478
479
480
      integer(kind=ik), intent(in)            :: n      !length of householder-vectors
      integer(kind=ik), intent(in)            :: nb     !number of householder-vectors
      integer(kind=ik), intent(in)            :: lda        !leading dimension of Y and W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(lda,nb)  !matrix containing nb householder-vectors of length b
      real(kind=REAL_DATATYPE), intent(in)    :: tau(nb)    !tau values
      real(kind=REAL_DATATYPE), intent(out)   :: W(lda,nb)  !output matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: mem(nb)    !memory for a temporary matrix of size nb
481
482
483

      integer(kind=ik)             :: i

484
   call timer%start("wy_gen" // PRECISION_SUFFIX)
485
486
487
488

   W(1:n,1) = tau(1)*Y(1:n,1)
   do i=2,nb
     W(1:n,i) = tau(i)*Y(1:n,i)
489
     call timer%start("blas")
490
491
     call PRECISION_GEMV('T', n, i-1,  CONST_1_0, Y, lda, W(1,i), 1, CONST_0_0, mem,1)
     call PRECISION_GEMV('N', n, i-1, -CONST_1_0, W, lda, mem, 1, CONST_1_0, W(1,i),1)
492
     call timer%stop("blas")
493
   enddo
494
   call timer%stop("wy_gen" // PRECISION_SUFFIX)
495
496
    end subroutine

497
    subroutine wy_left_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
498
499
500

#ifdef HAVE_DETAILED_TIMINGS
      use timings
501
502
#else
      use timings_dummy
503
504
505
#endif
      use precision
      implicit none
506
507
508
509
510
511
512
513
514
      integer(kind=ik), intent(in)            :: n      !width of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed   ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(m,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb)  !memory for a temporary matrix of size n x nb
515

516
   call timer%start("wy_left" // PRECISION_SUFFIX)
517
   call timer%start("blas")
518
519
   call PRECISION_GEMM('T', 'N', nb, n, m, CONST_1_0, W, lda2, A, lda, CONST_0_0, mem, nb)
   call PRECISION_GEMM('N', 'N', m, n, nb, -CONST_1_0, Y, lda2, mem, nb, CONST_1_0, A, lda)
520
   call timer%stop("blas")
521
   call timer%stop("wy_left" // PRECISION_SUFFIX)
522
523
    end subroutine

524
    subroutine wy_right_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
525
526
527

#ifdef HAVE_DETAILED_TIMINGS
      use timings
528
529
#else
      use timings_dummy
530
531
532
#endif
      use precision
      implicit none
533
534
535
536
537
538
539
540
541
      integer(kind=ik), intent(in)            :: n      !height of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(m,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb)  !memory for a temporary matrix of size n x nb
542
543


544
   call timer%start("wy_right" // PRECISION_SUFFIX)
545
   call timer%start("blas")
546
547
   call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
   call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
548
   call timer%stop("blas")
549
   call timer%stop("wy_right" // PRECISION_SUFFIX)
550
551
552

    end subroutine

553
    subroutine wy_symm_PRECISION(n, nb, A, lda, W, Y, mem, mem2, lda2)
554
555
556

#ifdef HAVE_DETAILED_TIMINGS
      use timings
557
558
#else
      use timings_dummy
559
560
561
#endif
      use precision
      implicit none
562
563
564
565
566
567
568
569
570
      integer(kind=ik), intent(in)            :: n      !width/heigth of the matrix A; length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(n,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(n,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE)                :: mem(n,nb)  !memory for a temporary matrix of size n x nb
      real(kind=REAL_DATATYPE)                :: mem2(nb,nb)    !memory for a temporary matrix of size nb x nb
571

572
   call timer%start("wy_symm" // PRECISION_SUFFIX)
573
   call timer%start("blas")
574
575
576
577
   call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
   call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
   call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
   call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
578
   call timer%stop("blas")
579
   call timer%stop("wy_symm" // PRECISION_SUFFIX)
580
581

    end subroutine
582