elpa2_compute_real_template.X90 22.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
46

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

64
65
66
67
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa2_bandred_template.X90"
#undef REALCASE
68
#include "elpa2_symm_matrix_allreduce_real_template.X90"
69
70
71
#define REALCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#undef REALCASE
72
73
#include "elpa2_tridiag_band_real_template.X90"
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"
74
75
76



77
    subroutine band_band_real_PRECISION(na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, mpi_comm)
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
    !-------------------------------------------------------------------------------
    ! band_band_real:
    ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
    ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
    ! zero entries, if this
    ! requirement doesn't hold)
    !
    !  na          Order of matrix
    !
    !  nb          Semi bandwidth of original matrix
    !
    !  nb2         Semi bandwidth of target matrix
    !
    !  ab          Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
    !              has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
    !              are located on rank n.
    !
    !  ab2         Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
    !              accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
    !              on rank n.
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
    !  mpi_comm
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
      use timings
108
109
#else
     use timings_dummy
110
#endif
111
      use elpa2_workload
112
113
114
      use precision
      implicit none

Andreas Marek's avatar
Andreas Marek committed
115
      integer(kind=ik), intent(in)             :: na, nb, nbCol, nb2, nb2Col, mpi_comm
116
117
118
119
120
      real(kind=REAL_DATATYPE), intent(inout)  :: ab(2*nb,nbCol) ! removed assumed size
      real(kind=REAL_DATATYPE), intent(inout)  :: ab2(2*nb2,nb2Col) ! removed assumed size
      real(kind=REAL_DATATYPE), intent(out)    :: d(na), e(na) ! set only on PE 0

      real(kind=REAL_DATATYPE)                 :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
Andreas Marek's avatar
Andreas Marek committed
121
                                                  tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
122
123

      real(kind=REAL_DATATYPE)                 :: work(nb*nb2), work2(nb2*nb2)
Andreas Marek's avatar
Andreas Marek committed
124
125
126
127
128
129
130
131
      integer(kind=ik)                         :: lwork, info

      integer(kind=ik)                         :: istep, i, n, dest
      integer(kind=ik)                         :: n_off, na_s
      integer(kind=ik)                         :: my_pe, n_pes, mpierr
      integer(kind=ik)                         :: nblocks_total, nblocks
      integer(kind=ik)                         :: nblocks_total2, nblocks2
      integer(kind=ik)                         :: ireq_ab, ireq_hv
132
#ifdef WITH_MPI
133
!      integer(kind=ik)                         :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
134
#endif
135
!      integer(kind=ik), allocatable            :: mpi_statuses(:,:)
Andreas Marek's avatar
Andreas Marek committed
136
      integer(kind=ik), allocatable            :: block_limits(:), block_limits2(:), ireq_ab2(:)
137

Andreas Marek's avatar
Andreas Marek committed
138
139
140
      integer(kind=ik)                         :: j, nc, nr, ns, ne, iblk
      integer(kind=ik)                         :: istat
      character(200)                           :: errorMessage
141

142
      call timer%start("band_band_real" // PRECISION_SUFFIX)
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
!      if (na .lt. 2*nb) then
!        print *,"na lt 2*nb ",na,2*nb
!        stop
!      endif
!      if (na .lt. 2*nb2) then
!        print *,"na lt 2*nb2 ",na,2*nb2
!        stop
!      endif
!      if (na .lt. nbCol) then
!        print *,"na lt nbCol ",na,nbCol
!        stop
!      endif
!      if (na .lt. nb2Col) then
!        print *,"na lt nb2Col ",na,nb2Col
!        stop
!      endif

160
      call timer%start("mpi_communication")
161
162
      call mpi_comm_rank(mpi_comm,my_pe,mpierr)
      call mpi_comm_size(mpi_comm,n_pes,mpierr)
163
164
      call timer%stop("mpi_communication")

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
      ! Total number of blocks in the band:
      nblocks_total = (na-1)/nb + 1
      nblocks_total2 = (na-1)/nb2 + 1

      ! Set work distribution
      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits "//errorMessage
        stop
      endif
      call divide_band(nblocks_total, n_pes, block_limits)

      allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits2 "//errorMessage
        stop
      endif

      call divide_band(nblocks_total2, n_pes, block_limits2)

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)
      nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)

      allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating ireq_ab2 "//errorMessage
        stop
      endif

#ifdef WITH_MPI
196
197
      call timer%start("mpi_communication")

198
199
200
201
202
      ireq_ab2 = MPI_REQUEST_NULL

      if (nb2>1) then
        do i=0,nblocks2-1

203
          call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, MPI_REAL_PRECISION, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr)
204
205
        enddo
      endif
206
207
      call timer%stop("mpi_communication")

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#else /* WITH_MPI */
      ! carefull the "recieve" has to be done at the corresponding send or wait
!      if (nb2>1) then
!        do i=0,nblocks2-1
!          ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
!        enddo
!      endif

#endif /* WITH_MPI */
      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb
      lwork = nb*nb2
      dest = 0
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first nb2 columns to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        do i=1,nb2
          ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
        enddo
#ifdef WITH_MPI
237
        call timer%start("mpi_communication")
238

239
        call mpi_isend(ab_s, (nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
240
        call timer%stop("mpi_communication")
241
242
243
244
245
246
247
248
#endif /* WITH_MPI */
      endif

      do istep=1,na/nb2

        if (my_pe==0) then

          n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
249
250
          hv(:,:) = CONST_0_0
          tau(:) = CONST_0_0
251
252
253
254
255
256

          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
          if (istep < na/nb2) then

            ! Transform first block column of remaining matrix
257
	    call timer%start("blas")
258
            call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
259
	    call timer%stop("blas")
260
261

            do i=1,nb2
262
              hv(i,i) = CONST_1_0
263
              hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
264
              ab(1+nb2+1:2*nb,na_s-n_off+i-1) = CONST_0_0
265
266
267
268
269
270
271
272
            enddo

          endif

          if (nb2==1) then
            d(istep) = ab(1,na_s-n_off)
            e(istep) = ab(2,na_s-n_off)
            if (istep == na) then
273
              e(na) = CONST_0_0
274
275
            endif
          else
276
            ab_s2 = CONST_0_0
277
278
279
280
281
            ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
            if (block_limits2(dest+1)<istep) then
              dest = dest+1
            endif
#ifdef WITH_MPI
282
            call timer%start("mpi_communication")
283
            call mpi_send(ab_s2, 2*nb2*nb2, MPI_REAL_PRECISION, dest, 3, mpi_comm, mpierr)
284
            call timer%stop("mpi_communication")
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

#else /* WITH_MPI */
            ! do irecv here
            if (nb2>1) then
              do i= 0,nblocks2-1
                ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
              enddo
            endif
#endif /* WITH_MPI */

          endif

        else
          if (na>na_s+nb2-1) then
            ! Receive Householder vectors from previous task, from PE owning subdiagonal
#ifdef WITH_MPI
301
            call timer%start("mpi_communication")
302
            call mpi_recv(hv, nb*nb2, MPI_REAL_PRECISION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
303
            call timer%stop("mpi_communication")
304
305
306
307
308
309
310

#else /* WITH_MPI */
           hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
#endif /* WITH_MPI */

            do i=1,nb2
              tau(i) = hv(i,i)
311
              hv(i,i) = CONST_1_0
312
313
314
315
316
317
318
            enddo
          endif
        endif

        na_s = na_s+nb2
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
319
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
320
321
322
323
324
325
326
327
328
329
330
331
          n_off = n_off + nb
        endif

        do iblk=1,nblocks
          ns = na_s + (iblk-1)*nb - n_off ! first column in block
          ne = ns+nb-nb2                    ! last column in block

          if (ns+n_off>na) exit

            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
332
            call wy_gen_PRECISION(nc,nb2,w,hv,tau,work,nb)
333
334
335
336

            if (iblk==nblocks .and. nc==nb) then
              !request last nb2 columns
#ifdef WITH_MPI
337
              call timer%start("mpi_communication")
338
              call mpi_recv(ab_r,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
339
              call timer%stop("mpi_communication")
340
341
342
343
344
345
346
347

#else /* WITH_MPI */
             ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
#endif /* WITH_MPI */
              do i=1,nb2
                ab(1:nb+1,ne+i-1) = ab_r(:,i)
              enddo
            endif
348
349
            hv_new(:,:) = CONST_0_0 ! Needed, last rows must be 0 for nr < nb
            tau_new(:) = CONST_0_0
350
351

            if (nr>0) then
352
              call wy_right_PRECISION(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
353
	      call timer%start("blas")
354
              call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
355
	      call timer%stop("blas")
356
              do i=1,nb2
357
                hv_new(i,i) = CONST_1_0
358
                hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
359
                ab(nb+2:,ns+i-1) = CONST_0_0
360
361
362
363
364
              enddo

              !send hh-vector
              if (iblk==nblocks) then
#ifdef WITH_MPI
365
366
                call timer%start("mpi_communication")

367
                call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
368
369
                call timer%stop("mpi_communication")

370
371
372
373
374
375
#endif
                hv_s = hv_new
                do i=1,nb2
                  hv_s(i,i) = tau_new(i)
                enddo
#ifdef WITH_MPI
376
                call timer%start("mpi_communication")
377
                call mpi_isend(hv_s,nb*nb2, MPI_REAL_PRECISION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
378
                call timer%stop("mpi_communication")
379
380
381
382
383
384
385

#else /* WITH_MPI */

#endif /* WITH_MPI */
              endif
            endif

386
            call wy_symm_PRECISION(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
387
388
389
390

            if (my_pe>0 .and. iblk==1) then
              !send first nb2 columns to previous PE
#ifdef WITH_MPI
391
392
              call timer%start("mpi_communication")

393
              call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
394
395
              call timer%stop("mpi_communication")

396
397
398
399
400
#endif
              do i=1,nb2
                ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
              enddo
#ifdef WITH_MPI
401
              call timer%start("mpi_communication")
402
              call mpi_isend(ab_s,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
403
              call timer%stop("mpi_communication")
404
405
406
407
408
409
410

#else /* WITH_MPI */

#endif /* WITH_MPI */
            endif

            if (nr>0) then
411
412
              call wy_gen_PRECISION(nr,nb2,w_new,hv_new,tau_new,work,nb)
              call wy_left_PRECISION(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
413
414
415
416
417
418
419
420
421
422
            endif

            ! Use new HH vector for the next block
            hv(:,:) = hv_new(:,:)
            tau = tau_new
          enddo
        enddo

        ! Finish the last outstanding requests
#ifdef WITH_MPI
423
424
         call timer%start("mpi_communication")

425
426
427
428
429
430
431
        call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
        call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
!        allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error allocating mpi_statuses "//errorMessage
!          stop
!        endif
432

433
434
435
436
437
438
        call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
!        deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error deallocating mpi_statuses "//errorMessage
!          stop
!        endif
439
440

        call mpi_barrier(mpi_comm,mpierr)
441
442
        call timer%stop("mpi_communication")

443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
#endif /* WITH_MPI */

        deallocate(block_limits, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits "//errorMessage
          stop
        endif

        deallocate(block_limits2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits2 "//errorMessage
          stop
        endif

        deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating ireq_ab2 "//errorMessage
          stop
        endif

463
        call timer%stop("band_band_real" // PRECISION_SUFFIX)
464
465
466

    end subroutine

467
    subroutine wy_gen_PRECISION(n, nb, W, Y, tau, mem, lda)
468
469
470

#ifdef HAVE_DETAILED_TIMINGS
   use timings
471
472
#else
   use timings_dummy
473
474
475
#endif
      use precision
      implicit none
476
477
478
479
480
481
482
      integer(kind=ik), intent(in)            :: n      !length of householder-vectors
      integer(kind=ik), intent(in)            :: nb     !number of householder-vectors
      integer(kind=ik), intent(in)            :: lda        !leading dimension of Y and W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(lda,nb)  !matrix containing nb householder-vectors of length b
      real(kind=REAL_DATATYPE), intent(in)    :: tau(nb)    !tau values
      real(kind=REAL_DATATYPE), intent(out)   :: W(lda,nb)  !output matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: mem(nb)    !memory for a temporary matrix of size nb
483
484
485

      integer(kind=ik)             :: i

486
   call timer%start("wy_gen" // PRECISION_SUFFIX)
487
488
489
490

   W(1:n,1) = tau(1)*Y(1:n,1)
   do i=2,nb
     W(1:n,i) = tau(i)*Y(1:n,i)
491
     call timer%start("blas")
492
493
     call PRECISION_GEMV('T', n, i-1,  CONST_1_0, Y, lda, W(1,i), 1, CONST_0_0, mem,1)
     call PRECISION_GEMV('N', n, i-1, -CONST_1_0, W, lda, mem, 1, CONST_1_0, W(1,i),1)
494
     call timer%stop("blas")
495
   enddo
496
   call timer%stop("wy_gen" // PRECISION_SUFFIX)
497
498
    end subroutine

499
    subroutine wy_left_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
500
501
502

#ifdef HAVE_DETAILED_TIMINGS
      use timings
503
504
#else
      use timings_dummy
505
506
507
#endif
      use precision
      implicit none
508
509
510
511
512
513
514
515
516
      integer(kind=ik), intent(in)            :: n      !width of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed   ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(m,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb)  !memory for a temporary matrix of size n x nb
517

518
   call timer%start("wy_left" // PRECISION_SUFFIX)
519
   call timer%start("blas")
520
521
   call PRECISION_GEMM('T', 'N', nb, n, m, CONST_1_0, W, lda2, A, lda, CONST_0_0, mem, nb)
   call PRECISION_GEMM('N', 'N', m, n, nb, -CONST_1_0, Y, lda2, mem, nb, CONST_1_0, A, lda)
522
   call timer%stop("blas")
523
   call timer%stop("wy_left" // PRECISION_SUFFIX)
524
525
    end subroutine

526
    subroutine wy_right_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
527
528
529

#ifdef HAVE_DETAILED_TIMINGS
      use timings
530
531
#else
      use timings_dummy
532
533
534
#endif
      use precision
      implicit none
535
536
537
538
539
540
541
542
543
      integer(kind=ik), intent(in)            :: n      !height of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(m,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb)  !memory for a temporary matrix of size n x nb
544
545


546
   call timer%start("wy_right" // PRECISION_SUFFIX)
547
   call timer%start("blas")
548
549
   call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
   call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
550
   call timer%stop("blas")
551
   call timer%stop("wy_right" // PRECISION_SUFFIX)
552
553
554

    end subroutine

555
    subroutine wy_symm_PRECISION(n, nb, A, lda, W, Y, mem, mem2, lda2)
556
557
558

#ifdef HAVE_DETAILED_TIMINGS
      use timings
559
560
#else
      use timings_dummy
561
562
563
#endif
      use precision
      implicit none
564
565
566
567
568
569
570
571
572
      integer(kind=ik), intent(in)            :: n      !width/heigth of the matrix A; length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
      real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=REAL_DATATYPE), intent(in)    :: W(n,nb)    !blocked transformation matrix W
      real(kind=REAL_DATATYPE), intent(in)    :: Y(n,nb)    !blocked transformation matrix Y
      real(kind=REAL_DATATYPE)                :: mem(n,nb)  !memory for a temporary matrix of size n x nb
      real(kind=REAL_DATATYPE)                :: mem2(nb,nb)    !memory for a temporary matrix of size nb x nb
573

574
   call timer%start("wy_symm" // PRECISION_SUFFIX)
575
   call timer%start("blas")
576
577
578
579
   call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
   call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
   call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
   call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
580
   call timer%stop("blas")
581
   call timer%stop("wy_symm" // PRECISION_SUFFIX)
582
583

    end subroutine
584