elpa2_compute_real_template.F90 25.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
14
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
Andreas Marek's avatar
Andreas Marek committed
52 53
!
! Author: Andreas Marek, MPCDF
54 55
#endif

56
#include "../general/sanity.F90"
57

58
#if REALCASE == 1
59
#include "../general/error_checking.inc"
60 61 62
#endif


63 64
#define REALCASE 1
#undef COMPLEXCASE
65
#include "elpa2_bandred_template.F90"
66
#define REALCASE 1
67
#undef SKEW_SYMMETRIC_BUILD
68
#include "elpa2_symm_matrix_allreduce_real_template.F90"
69 70
#ifdef HAVE_SKEWSYMMETRIC
#define SKEW_SYMMETRIC_BUILD
71
#include "elpa2_symm_matrix_allreduce_real_template.F90"
72
#undef SKEW_SYMMETRIC_BUILD
73
#endif
74
#undef REALCASE
75
#define REALCASE 1
76 77 78
#include "elpa2_trans_ev_band_to_full_template.F90"
#include "elpa2_tridiag_band_template.F90"
#include "elpa2_trans_ev_tridi_to_band_template.F90"
79 80 81



82 83
    subroutine band_band_real_&
&PRECISION &
84
                  (obj, na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, communicator)
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
    !-------------------------------------------------------------------------------
    ! band_band_real:
    ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
    ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
    ! zero entries, if this
    ! requirement doesn't hold)
    !
    !  na          Order of matrix
    !
    !  nb          Semi bandwidth of original matrix
    !
    !  nb2         Semi bandwidth of target matrix
    !
    !  ab          Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
    !              has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
    !              are located on rank n.
    !
    !  ab2         Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
    !              accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
    !              on rank n.
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
    !
110
    !  communicator
111 112
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
113
      use elpa_abstract_impl
114
      use elpa2_workload
115 116
      use elpa_blas_interfaces

117 118
      use precision
      implicit none
119
#include "../general/precision_kinds.F90"
120
      class(elpa_abstract_impl_t), intent(inout) :: obj
121 122 123 124
      integer(kind=ik), intent(in)               :: na, nb, nbCol, nb2, nb2Col, communicator
      real(kind=rk), intent(inout)               :: ab(2*nb,nbCol) ! removed assumed size
      real(kind=rk), intent(inout)               :: ab2(2*nb2,nb2Col) ! removed assumed size
      real(kind=rk), intent(out)                 :: d(na), e(na) ! set only on PE 0
125

126
      real(kind=rk)                              :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
Andreas Marek's avatar
Andreas Marek committed
127
                                                  tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
128

129
      real(kind=rk)                              :: work(nb*nb2), work2(nb2*nb2)
Andreas Marek's avatar
Andreas Marek committed
130
      integer(kind=ik)                         :: lwork, info
Andreas Marek's avatar
Andreas Marek committed
131
      integer(kind=BLAS_KIND)                  :: infoBLAS
Andreas Marek's avatar
Andreas Marek committed
132 133 134

      integer(kind=ik)                         :: istep, i, n, dest
      integer(kind=ik)                         :: n_off, na_s
135 136
      integer(kind=ik)                         :: my_pe, n_pes
      integer(kind=MPI_KIND)                   :: my_peMPI, n_pesMPI, mpierr
Andreas Marek's avatar
Andreas Marek committed
137 138
      integer(kind=ik)                         :: nblocks_total, nblocks
      integer(kind=ik)                         :: nblocks_total2, nblocks2
139
      integer(kind=MPI_KIND)                   :: ireq_ab, ireq_hv
140
#ifdef WITH_MPI
141
!      integer(kind=ik)                         :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
142
#endif
143
!      integer(kind=ik), allocatable            :: mpi_statuses(:,:)
144 145
      integer(kind=ik), allocatable            :: block_limits(:), block_limits2(:)
      integer(kind=MPI_KIND), allocatable      :: ireq_ab2(:)
146

Andreas Marek's avatar
Andreas Marek committed
147 148 149
      integer(kind=ik)                         :: j, nc, nr, ns, ne, iblk
      integer(kind=ik)                         :: istat
      character(200)                           :: errorMessage
150

151
      call obj%timer%start("band_band_real" // PRECISION_SUFFIX)
152

153
      call obj%timer%start("mpi_communication")
154 155 156 157 158
      call mpi_comm_rank(int(communicator,kind=MPI_KIND) ,my_peMPI ,mpierr)
      call mpi_comm_size(int(communicator,kind=MPI_KIND) ,n_pesMPI ,mpierr)

      my_pe = int(my_peMPI,kind=c_int)
      n_pes = int(n_pesMPI,kind=c_int)
159
      call obj%timer%stop("mpi_communication")
160

161 162 163 164 165 166 167 168
      ! Total number of blocks in the band:
      nblocks_total = (na-1)/nb + 1
      nblocks_total2 = (na-1)/nb2 + 1

      ! Set work distribution
      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
169
        stop 1
170
      endif
171
      call divide_band(obj, nblocks_total, n_pes, block_limits)
172 173 174 175

      allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating block_limits2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
176
        stop 1
177 178
      endif

179
      call divide_band(obj, nblocks_total2, n_pes, block_limits2)
180 181 182 183 184 185 186 187

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)
      nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)

      allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"error allocating ireq_ab2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
188
        stop 1
189 190 191
      endif

#ifdef WITH_MPI
192
      call obj%timer%start("mpi_communication")
193

194 195 196 197 198
      ireq_ab2 = MPI_REQUEST_NULL

      if (nb2>1) then
        do i=0,nblocks2-1

199 200
          call mpi_irecv(ab2(1,i*nb2+1), int(2*nb2*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, &
                         0_MPI_KIND, 3_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab2(i+1), mpierr)
201 202
        enddo
      endif
203
      call obj%timer%stop("mpi_communication")
204

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
#else /* WITH_MPI */
      ! carefull the "recieve" has to be done at the corresponding send or wait
!      if (nb2>1) then
!        do i=0,nblocks2-1
!          ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
!        enddo
!      endif

#endif /* WITH_MPI */
      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb
      lwork = nb*nb2
      dest = 0
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first nb2 columns to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        do i=1,nb2
          ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
        enddo
#ifdef WITH_MPI
234
        call obj%timer%start("mpi_communication")
235

236 237
        call mpi_isend(ab_s, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), &
                       1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr)
238
        call obj%timer%stop("mpi_communication")
239 240 241 242 243 244 245 246
#endif /* WITH_MPI */
      endif

      do istep=1,na/nb2

        if (my_pe==0) then

          n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
247 248
          hv(:,:) = 0.0_rk
          tau(:) = 0.0_rk
249 250 251 252 253 254

          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
          if (istep < na/nb2) then

            ! Transform first block column of remaining matrix
Andreas Marek's avatar
Retab  
Andreas Marek committed
255
      call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
256 257 258 259
            call PRECISION_GEQRF(int(n,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(1+nb2,na_s-n_off), &
                                 int(2*nb-1,kind=BLAs_KIND), tau, work, int(lwork,kind=BLAS_KIND), &
                                 infoBLAS)
      info = int(infoBLAS,kind=ik)
Andreas Marek's avatar
Retab  
Andreas Marek committed
260
      call obj%timer%stop("blas")
261 262

            do i=1,nb2
263
              hv(i,i) = 1.0_rk
264
              hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
265
              ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0.0_rk
266 267 268 269 270 271 272 273
            enddo

          endif

          if (nb2==1) then
            d(istep) = ab(1,na_s-n_off)
            e(istep) = ab(2,na_s-n_off)
            if (istep == na) then
274
              e(na) = 0.0_rk
275 276
            endif
          else
277
            ab_s2 = 0.0_rk
278 279 280 281 282
            ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
            if (block_limits2(dest+1)<istep) then
              dest = dest+1
            endif
#ifdef WITH_MPI
283
            call obj%timer%start("mpi_communication")
284 285
            call mpi_send(ab_s2, int(2*nb2*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(dest,kind=MPI_KIND), &
                          3_MPI_KIND, int(communicator,kind=MPI_KIND), mpierr)
286
            call obj%timer%stop("mpi_communication")
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302

#else /* WITH_MPI */
            ! do irecv here
            if (nb2>1) then
              do i= 0,nblocks2-1
                ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
              enddo
            endif
#endif /* WITH_MPI */

          endif

        else
          if (na>na_s+nb2-1) then
            ! Receive Householder vectors from previous task, from PE owning subdiagonal
#ifdef WITH_MPI
303
            call obj%timer%start("mpi_communication")
304 305
            call mpi_recv(hv, int(nb*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), &
                          2_MPI_KIND, int(communicator,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
306
            call obj%timer%stop("mpi_communication")
307 308 309 310 311 312 313

#else /* WITH_MPI */
           hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
#endif /* WITH_MPI */

            do i=1,nb2
              tau(i) = hv(i,i)
314
              hv(i,i) = 1.0_rk
315 316 317 318 319 320 321
            enddo
          endif
        endif

        na_s = na_s+nb2
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
322
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rk
323 324 325 326 327 328 329 330 331 332 333 334
          n_off = n_off + nb
        endif

        do iblk=1,nblocks
          ns = na_s + (iblk-1)*nb - n_off ! first column in block
          ne = ns+nb-nb2                    ! last column in block

          if (ns+n_off>na) exit

            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
Andreas Marek's avatar
Andreas Marek committed
335
            call wy_gen_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
336 337
      &PRECISION&
      &(obj,nc,nb2,w,hv,tau,work,nb)
338 339 340 341

            if (iblk==nblocks .and. nc==nb) then
              !request last nb2 columns
#ifdef WITH_MPI
342
              call obj%timer%start("mpi_communication")
343 344
              call mpi_recv(ab_r, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe+1,kind=MPI_KIND), &
                            1_MPI_KIND, int(communicator,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
345
              call obj%timer%stop("mpi_communication")
346 347 348 349 350 351 352 353

#else /* WITH_MPI */
             ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
#endif /* WITH_MPI */
              do i=1,nb2
                ab(1:nb+1,ne+i-1) = ab_r(:,i)
              enddo
            endif
354 355
            hv_new(:,:) = 0.0_rk ! Needed, last rows must be 0 for nr < nb
            tau_new(:) = 0.0_rk
356 357

            if (nr>0) then
Andreas Marek's avatar
Andreas Marek committed
358
              call wy_right_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
359 360 361
        &PRECISION&
        &(obj,nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
        call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
362 363 364 365
              call PRECISION_GEQRF(int(nr,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(nb+1,ns), &
                                   int(2*nb-1,kind=BLAS_KIND), tau_new, work, int(lwork,kind=BLAS_KIND), &
                                   infoBLAS)
        info = int(infoBLAS,kind=ik)
Andreas Marek's avatar
Retab  
Andreas Marek committed
366
        call obj%timer%stop("blas")
367
              do i=1,nb2
368
                hv_new(i,i) = 1.0_rk
369
                hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
370
                ab(nb+2:,ns+i-1) = 0.0_rk
371 372
              enddo

373
              !send hh-Vector
374 375
              if (iblk==nblocks) then
#ifdef WITH_MPI
376
                call obj%timer%start("mpi_communication")
377

378
                call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
379
                call obj%timer%stop("mpi_communication")
380

381 382 383 384 385 386
#endif
                hv_s = hv_new
                do i=1,nb2
                  hv_s(i,i) = tau_new(i)
                enddo
#ifdef WITH_MPI
387
                call obj%timer%start("mpi_communication")
388 389
                call mpi_isend(hv_s, int(nb*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe+1,kind=MPI_KIND), &
                               2_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_hv, mpierr)
390
                call obj%timer%stop("mpi_communication")
391 392 393 394 395 396 397

#else /* WITH_MPI */

#endif /* WITH_MPI */
              endif
            endif

Andreas Marek's avatar
Andreas Marek committed
398
            call wy_symm_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
399 400
      &PRECISION&
      &(obj,nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
401 402 403 404

            if (my_pe>0 .and. iblk==1) then
              !send first nb2 columns to previous PE
#ifdef WITH_MPI
405
              call obj%timer%start("mpi_communication")
406

407
              call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
408
              call obj%timer%stop("mpi_communication")
409

410 411 412 413 414
#endif
              do i=1,nb2
                ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
              enddo
#ifdef WITH_MPI
415
              call obj%timer%start("mpi_communication")
416 417
              call mpi_isend(ab_s, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), &
                             1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr)
418
              call obj%timer%stop("mpi_communication")
419 420 421 422 423 424 425

#else /* WITH_MPI */

#endif /* WITH_MPI */
            endif

            if (nr>0) then
Andreas Marek's avatar
Andreas Marek committed
426
              call wy_gen_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
427 428
        &PRECISION&
        &(obj,nr,nb2,w_new,hv_new,tau_new,work,nb)
Andreas Marek's avatar
Andreas Marek committed
429
              call wy_left_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
430 431
        &PRECISION&
        &(obj,nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
432 433
            endif

434
            ! Use new HH Vector for the next block
435 436 437 438 439 440 441
            hv(:,:) = hv_new(:,:)
            tau = tau_new
          enddo
        enddo

        ! Finish the last outstanding requests
#ifdef WITH_MPI
442
         call obj%timer%start("mpi_communication")
443

444 445 446 447 448
        call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
        call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
!        allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error allocating mpi_statuses "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
449
!          stop 1
450
!        endif
451

452 453 454 455
        call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
!        deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
!        if (istat .ne. 0) then
!          print *,"error deallocating mpi_statuses "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
456
!          stop 1
457
!        endif
458

459
        call mpi_barrier(int(communicator,kind=MPI_KIND) ,mpierr)
460
        call obj%timer%stop("mpi_communication")
461

462 463 464 465 466
#endif /* WITH_MPI */

        deallocate(block_limits, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
467
          stop 1
468 469 470 471 472
        endif

        deallocate(block_limits2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating block_limits2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
473
          stop 1
474 475 476 477 478
        endif

        deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"error deallocating ireq_ab2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
479
          stop 1
480 481
        endif

482
        call obj%timer%stop("band_band_real" // PRECISION_SUFFIX)
483 484 485

    end subroutine

Andreas Marek's avatar
Andreas Marek committed
486 487
    subroutine wy_gen_&
    &PRECISION&
488
    &(obj, n, nb, W, Y, tau, mem, lda)
489

490
      use elpa_abstract_impl
491 492
      use elpa_blas_interfaces

493 494
      use precision
      implicit none
495
#include "../general/precision_kinds.F90"
496
      class(elpa_abstract_impl_t), intent(inout) :: obj
497 498 499
      integer(kind=ik), intent(in)            :: n      !length of householder-vectors
      integer(kind=ik), intent(in)            :: nb     !number of householder-vectors
      integer(kind=ik), intent(in)            :: lda        !leading dimension of Y and W
500 501 502 503
      real(kind=rk), intent(in)               :: Y(lda,nb)  !matrix containing nb householder-vectors of length b
      real(kind=rk), intent(in)               :: tau(nb)    !tau values
      real(kind=rk), intent(out)              :: W(lda,nb)  !output matrix W
      real(kind=rk), intent(in)               :: mem(nb)    !memory for a temporary matrix of size nb
504

505
      integer(kind=ik)                        :: i
506

507
   call obj%timer%start("wy_gen" // PRECISION_SUFFIX)
508 509 510 511

   W(1:n,1) = tau(1)*Y(1:n,1)
   do i=2,nb
     W(1:n,i) = tau(i)*Y(1:n,i)
512
     call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
513 514 515 516
     call PRECISION_GEMV('T', int(n,kind=BLAS_KIND), int(i-1,kind=BLAS_KIND),  1.0_rk, Y, int(lda,kind=BLAS_KIND), &
                         W(1,i), 1_BLAS_KIND, 0.0_rk, mem, 1_BLAS_KIND)
     call PRECISION_GEMV('N', int(n,kind=BLAS_KIND), int(i-1,kind=BLAS_KIND), -1.0_rk, W, int(lda,kind=BLAS_KIND), &
                         mem, 1_BLAS_KIND, 1.0_rk, W(1,i), 1_BLAS_KIND)
517
     call obj%timer%stop("blas")
518
   enddo
519
   call obj%timer%stop("wy_gen" // PRECISION_SUFFIX)
520 521
    end subroutine

Andreas Marek's avatar
Andreas Marek committed
522 523
    subroutine wy_left_&
    &PRECISION&
524
    &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
525 526

      use precision
527
      use elpa_abstract_impl
528
      use elpa_blas_interfaces
529
      implicit none
530
#include "../general/precision_kinds.F90"
531
      class(elpa_abstract_impl_t), intent(inout) :: obj
532 533 534 535 536
      integer(kind=ik), intent(in)            :: n      !width of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
537 538 539 540
      real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed   ! remove assumed size
      real(kind=rk), intent(in)               :: W(m,nb)    !blocked transformation matrix W
      real(kind=rk), intent(in)               :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=rk), intent(inout)            :: mem(n,nb)  !memory for a temporary matrix of size n x nb
541

542 543
   call obj%timer%start("wy_left" // PRECISION_SUFFIX)
   call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
544 545 546 547 548
   call PRECISION_GEMM('T', 'N', int(nb,kind=BLAS_KIND), int(n,kind=BLAS_KIND), int(m,kind=BLAS_KIND), &
                       1.0_rk, W, int(lda2,kind=BLAS_KIND), A, int(lda,kind=BLAS_KIND), 0.0_rk, mem, &
                       int(nb,kind=BLAS_KIND))
   call PRECISION_GEMM('N', 'N', int(m,kind=BLAS_KIND), int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), &
                       -1.0_rk, Y, int(lda2,kind=BLAS_KIND), mem, int(nb,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND))
549 550
   call obj%timer%stop("blas")
   call obj%timer%stop("wy_left" // PRECISION_SUFFIX)
551 552
    end subroutine

Andreas Marek's avatar
Andreas Marek committed
553 554
    subroutine wy_right_&
    &PRECISION&
555
    &(obj, n, m, nb, A, lda, W, Y, mem, lda2)
556 557

      use precision
558
      use elpa_abstract_impl
559
      use elpa_blas_interfaces
560
      implicit none
561
#include "../general/precision_kinds.F90"
562
      class(elpa_abstract_impl_t), intent(inout) :: obj
563 564 565 566 567
      integer(kind=ik), intent(in)            :: n      !height of the matrix A
      integer(kind=ik), intent(in)            :: m      !length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
568 569 570 571
      real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=rk), intent(in)               :: W(m,nb)    !blocked transformation matrix W
      real(kind=rk), intent(in)               :: Y(m,nb)    !blocked transformation matrix Y
      real(kind=rk), intent(inout)            :: mem(n,nb)  !memory for a temporary matrix of size n x nb
572 573


574 575
      call obj%timer%start("wy_right" // PRECISION_SUFFIX)
      call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
576 577 578 579
      call PRECISION_GEMM('N', 'N', int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(m,kind=BLAS_KIND), &
                          1.0_rk, A, int(lda,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem, int(n,kind=BLAS_KIND))
      call PRECISION_GEMM('N', 'T', int(n,kind=BLAS_KIND), int(m,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), &
                          -1.0_rk, mem, int(n,kind=BLAS_KIND), Y, int(lda2,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND))
580 581
      call obj%timer%stop("blas")
      call obj%timer%stop("wy_right" // PRECISION_SUFFIX)
582 583 584

    end subroutine

Andreas Marek's avatar
Andreas Marek committed
585 586
    subroutine wy_symm_&
    &PRECISION&
587
    &(obj, n, nb, A, lda, W, Y, mem, mem2, lda2)
588

589
      use elpa_abstract_impl
590 591
      use elpa_blas_interfaces

592 593
      use precision
      implicit none
594
#include "../general/precision_kinds.F90"
595
      class(elpa_abstract_impl_t), intent(inout) :: obj
596 597 598 599
      integer(kind=ik), intent(in)            :: n      !width/heigth of the matrix A; length of matrix W and Y
      integer(kind=ik), intent(in)            :: nb     !width of matrix W and Y
      integer(kind=ik), intent(in)            :: lda        !leading dimension of A
      integer(kind=ik), intent(in)            :: lda2       !leading dimension of W and Y
600 601 602 603 604
      real(kind=rk), intent(inout)            :: A(lda,*)   !matrix to be transformed  ! remove assumed size
      real(kind=rk), intent(in)               :: W(n,nb)    !blocked transformation matrix W
      real(kind=rk), intent(in)               :: Y(n,nb)    !blocked transformation matrix Y
      real(kind=rk)                           :: mem(n,nb)  !memory for a temporary matrix of size n x nb
      real(kind=rk)                           :: mem2(nb,nb)    !memory for a temporary matrix of size nb x nb
605

606 607
      call obj%timer%start("wy_symm" // PRECISION_SUFFIX)
      call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
608 609 610 611 612 613 614 615 616
      call PRECISION_SYMM('L', 'L', int(n, kind=BLAS_KIND), int(nb,kind=BLAS_KIND), 1.0_rk, A, &
                          int(lda,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem, int(n,kind=BLAS_KIND))
      call PRECISION_GEMM('T', 'N', int(nb,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(n,kind=BLAS_KIND), &
                          1.0_rk, mem, int(n,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem2, &
                          int(nb,kind=BLAS_KIND))
      call PRECISION_GEMM('N', 'N', int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), &
                          -0.5_rk, Y, int(lda2,kind=BLAS_KIND), mem2, int(nb,kind=BLAS_KIND), 1.0_rk, mem, int(n,kind=BLAS_KIND))
      call PRECISION_SYR2K('L', 'N',int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -1.0_rk, Y, int(lda2,kind=BLAS_KIND), &
                           mem, int(n,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND))
617 618
      call obj%timer%stop("blas")
      call obj%timer%stop("wy_symm" // PRECISION_SUFFIX)
619 620

    end subroutine
621