elpa2_tridiag_band_template.F90 50.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53

54
#if REALCASE == 1
55
    subroutine tridiag_band_real_&
56 57
#endif
#if COMPLEXCASE == 1
58
    subroutine tridiag_band_complex_&
59
#endif
60
    &PRECISION &
61
    (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
62 63 64 65 66 67
#if REALCASE == 1
    hh_trans_real, &
#endif
#if COMPLEXCASE == 1
    hh_trans_complex, &
#endif
68
    mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
69 70 71 72 73 74 75 76 77 78
    !-------------------------------------------------------------------------------
    ! tridiag_band_real/complex:
    ! Reduces a real symmetric band matrix to tridiagonal form
    !
    !  na          Order of matrix a
    !
    !  nb          Semi bandwith
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
Andreas Marek's avatar
Andreas Marek committed
79
    !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
80 81 82 83 84 85 86 87 88 89 90 91 92
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
    ! hh_trans_real/complex : housholder vectors
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
93
    !  communicator
94 95
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
96
      use elpa_abstract_impl
97 98
      use elpa2_workload
      use precision
Andreas Marek's avatar
Andreas Marek committed
99
      use iso_c_binding
100
      use redist
101 102
      implicit none

Andreas Marek's avatar
Andreas Marek committed
103
      class(elpa_abstract_impl_t), intent(inout)   :: obj
104
      logical, intent(in)                          :: useGPU, wantDebug
105
      integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
106 107
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
108
      real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,*)
109
#else
Andreas Marek's avatar
Andreas Marek committed
110
      real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,matrixCols)
111
#endif
112
#endif /* REALCASE */
113 114
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
115
      complex(kind=COMPLEX_DATATYPE),intent(in)    :: aMatrix(lda,*)
116
#else
Andreas Marek's avatar
Andreas Marek committed
117
      complex(kind=COMPLEX_DATATYPE), intent(in)   :: aMatrix(lda,matrixCols)
118
#endif
119
#endif /* COMPLEXCASE */
Andreas Marek's avatar
Andreas Marek committed
120
      integer(kind=c_intptr_t)                     :: a_dev
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
      real(kind=REAL_DATATYPE), intent(out)        :: d(na), e(na) ! set only on PE 0
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out), &
          allocatable                              :: hh_trans_real(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(inout), &
          allocatable                              :: hh_trans_complex(:,:)
#endif

      real(kind=REAL_DATATYPE)                     :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      real(kind=REAL_DATATYPE)                     :: hd(nb), hs(nb)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)               :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      complex(kind=COMPLEX_DATATYPE)               :: hd(nb), hs(nb)
#endif

      integer(kind=ik)                             :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
      integer(kind=ik)                             :: my_pe, n_pes, mpierr
      integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
      integer(kind=ik)                             :: ireq_ab, ireq_hv
      integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
!      integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
#endif
!      integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: omp_block_limits(:)
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable        :: hv_t(:,:), tau_t(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable  :: hv_t(:,:), tau_t(:)
#endif
      integer(kind=ik)                             :: omp_get_max_threads
#endif /* WITH_OPENMP */
      integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
      integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
      integer(kind=ik), allocatable                :: block_limits(:)
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable  :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
      integer                                      :: istat
      character(200)                               :: errorMessage

#ifndef WITH_MPI
      integer(kind=ik)                             :: startAddr
#endif
177
      call obj%timer%start("tridiag_band_&
178
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
179
      &" // &
180 181
      &PRECISION_SUFFIX &
      )
182
      if (wantDebug) call obj%timer%start("mpi_communication")
183 184
      call mpi_comm_rank(communicator,my_pe,mpierr)
      call mpi_comm_size(communicator,n_pes,mpierr)
185 186 187 188 189

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
190
      if (wantDebug) call obj%timer%stop("mpi_communication")
191 192 193 194 195 196

      ! Get global_id mapping 2D procssor coordinates to global id

      allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
197 198
  &MATH_DATATYPE&
  &: error when allocating global_id "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
199
        stop 1
200 201 202 203 204 205 206 207 208
      endif

      global_id(:,:) = 0
      global_id(my_prow, my_pcol) = my_pe

#ifdef WITH_OPENMP
      allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
209 210
  &MATH_DATATYPE&
  &: error when allocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
211
        stop 1
212 213 214 215
      endif
#endif

#ifdef WITH_MPI
216
      if (wantDebug) call obj%timer%start("mpi_communication")
217
#ifndef WITH_OPENMP
218
      call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
219 220
#else
      global_id_tmp(:,:) = global_id(:,:)
221
      call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
222 223 224
      deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
225 226
  &MATH_DATATYPE&
  &: error when deallocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
227
        stop 1
228 229
      endif
#endif /* WITH_OPENMP */
230
      if (wantDebug) call obj%timer%stop("mpi_communication")
231 232 233 234 235 236 237 238 239 240 241
#endif /* WITH_MPI */

      ! Total number of blocks in the band:

      nblocks_total = (na-1)/nb + 1

      ! Set work distribution

      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
242 243
  &MATH_DATATYPE&
  &: error when allocating block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
244
        stop 1
245 246
      endif

247
      call divide_band(obj,nblocks_total, n_pes, block_limits)
248 249 250 251 252 253 254 255 256

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)

      ! allocate the part of the band matrix which is needed by this PE
      ! The size is 1 block larger than needed to avoid extensive shifts
      allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
257 258
  &MATH_DATATYPE&
  &: error when allocating ab"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
259
        stop 1
260 261 262 263 264 265 266 267
      endif

      ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb

      ! Redistribute band in a to ab
Andreas Marek's avatar
Andreas Marek committed
268 269 270 271
      call redist_band_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
272
      &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
Andreas Marek's avatar
Andreas Marek committed
273

274 275 276 277 278 279
      ! Calculate the workload for each sweep in the back transformation
      ! and the space requirements to hold the HH vectors

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
280 281
  &MATH_DATATYPE&
  &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
282
        stop 1
283 284
      endif

285
      call determine_workload(obj,na, nb, np_rows, limits)
286 287 288 289 290 291
      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      do n = 1, nblocks_total
292
        call determine_workload(obj, nx, nb, np_rows, limits)
293 294
        local_size = limits(my_prow+1) - limits(my_prow)
        ! add to number of householder vectors
295
        ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor send below!
296 297 298 299 300 301 302 303 304 305 306 307 308
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_hh_vecs = num_hh_vecs + local_size
          num_chunks  = num_chunks+1
        endif
        nx = nx - nb
      enddo

      ! Allocate space for HH vectors

#if REALCASE == 1
      allocate(hh_trans_real(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_real: error when allocating hh_trans_real"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
309
        stop 1
310 311 312 313 314 315
      endif
#endif
#if COMPLEXCASE == 1
      allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hh_trans_comples "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
316
        stop 1
317 318 319 320 321 322 323 324
      endif
#endif

      ! Allocate and init MPI requests

      allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
325 326
  &MATH_DATATYPE&
  &: error when allocating ireq_hhr"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
327
        stop 1
328 329 330 331
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
332 333
  &MATH_DATATYEP&
  &: error when allocating ireq_hhs"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
334
        stop 1
335 336 337 338 339 340 341
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
342
        call determine_workload(obj,nx, nb, np_rows, limits)
343 344 345 346
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks  = num_chunks+1
#ifdef WITH_MPI
347
          if (wantDebug) call obj%timer%start("mpi_communication")
348 349 350 351 352 353
#if REALCASE == 1
          call mpi_irecv(hh_trans_real(1,num_hh_vecs+1),          &
#endif
#if COMPLEXCASE == 1
          call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1),       &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
354
                   nb*local_size,                           &
355
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
356
       MPI_REAL_PRECISION,                     &
357 358 359 360
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION,           &
#endif
361
                        nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
362
          if (wantDebug) call obj%timer%stop("mpi_communication")
363 364 365 366

#else /* WITH_MPI */
          ! carefull non-block recv data copy must be done at wait or send
          ! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
Andreas Marek's avatar
Retab  
Andreas Marek committed
367
    ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
384 385
  &MATH_DATATYPE&
  &: error when allocating hh_gath"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
386
        stop 1
387 388 389 390 391
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
392 393
  &MATH_DATATYPE&
  &: error when allocating hh_send"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
394
        stop 1
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
      endif

#if REALCASE == 1
      hh_gath(:,:,:) = CONST_0_0
      hh_send(:,:,:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
      hh_gath(:,:,:) = CONST_COMPLEX_0_0
      hh_send(:,:,:) = CONST_COMPLEX_0_0
#endif
      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
410 411
  &MATH_DATATYPE&
  &: error when allocating hh_cnt"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
412
        stop 1
413 414 415 416 417
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
418 419
  &MATH_DATATYPE&
  &: error when allocating hh_dst"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
420
        stop 1
421 422
      endif

423
      hh_cnt(:) = 1 ! The first transfomation Vector is always 0 and not calculated at all
424 425 426 427 428 429 430 431 432 433
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
434 435
  &MATH_DATATYPE&
  &: error when allocating snd_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
436
        stop 1
437 438
      endif
      do iblk=1,nblocks
439
        call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:

      max_threads = 1
#if REALCASE == 1
      max_threads = omp_get_max_threads()
#endif
#if COMPLEXCASE == 1
!$      max_threads = omp_get_max_threads()
#endif
      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
459 460
  &MATH_DATATYPE&
  &: error when allocating omp_block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
461
        stop 1
462 463 464
      endif

      ! Get the OpenMP block limits
465
      call divide_band(obj,nblocks, max_threads, omp_block_limits)
466 467 468 469

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
470 471
  &MATH_DATATYPE&
  &: error when allocating hv_t, tau_t"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
472
        stop 1
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
      endif

#if REALCASE == 1
      hv_t = 0
      tau_t = 0
#endif
#if COMPLEXCASE == 1
      hv_t = CONST_COMPLEX_0_0
      tau_t = CONST_COMPLEX_0_0
#endif
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
495
        if (wantDebug) call obj%timer%start("mpi_communication")
496 497
        call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
498
                 MPI_REAL_PRECISION,  &
499 500 501 502
#endif
#if COMPLEXCASE == 1
                       MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
503
           my_pe-1, 1, communicator, ireq_ab, mpierr)
504
        if (wantDebug) call obj%timer%stop("mpi_communication")
505 506 507 508 509
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
#if REALCASE == 1
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
510
          startAddr = ubound(hh_trans_real,dim=2)
511 512 513 514
#endif
#if COMPLEXCASE == 1
          startAddr = ubound(hh_trans_complex,dim=2)
#endif
515
#endif /* WITH_MPI */
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif

        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
#if REALCASE == 1
          hv(:) = CONST_0_0
          tau = CONST_0_0
#endif
#if COMPLEXCASE == 1
          hv(:) = CONST_COMPLEX_0_0
          tau = CONST_COMPLEX_0_0
#endif

          ! Transform first column of remaining matrix
#if REALCASE == 1
          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
         ! Opposed to the real case, the last step (istep=na-1) is needed here for making
         ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
          if (istep < na-1) then
            ! Transform first column of remaining matrix
            vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only
556
#endif /* COMPLEXCASE */
557

558 559 560 561
#if REALCASE == 1
            call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
562
            call hh_transform_complex_&
563 564
#endif
&PRECISION &
565
                               (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
566 567 568 569 570 571 572

#if REALCASE == 1
            hv(1) = CONST_1_0
            hv(2:n) = ab(3:n+1,na_s-n_off)*hf
          endif
#endif
#if COMPLEXCASE == 1
573 574
          hv(1) = CONST_COMPLEX_1_0
          hv(2:n) = ab(3:n+1,na_s-n_off)*hf
575
#endif
576 577 578 579 580 581 582 583 584 585 586 587 588
          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
          if (istep == na-1) then
            d(na) = ab(1,na_s+1-n_off)
#if REALCASE == 1
            e(na) = CONST_0_0
#endif
#if COMPLEXCASE == 1
           e(na) = CONST_REAL_0_0
#endif
          endif
        else
          if (na>na_s) then
589
            ! Receive Householder Vector from previous task, from PE owning subdiagonal
590 591 592 593

#ifdef WITH_OPENMP

#ifdef WITH_MPI
594
            if (wantDebug) call obj%timer%start("mpi_communication")
595 596 597 598 599 600 601
            call mpi_recv(hv, nb,       &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
602
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
603
            if (wantDebug) call obj%timer%stop("mpi_communication")
604 605 606 607 608 609 610 611 612 613

#else /* WITH_MPI */

            hv(1:nb) = hv_s(1:nb)

#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
614
            if (wantDebug) call obj%timer%start("mpi_communication")
615 616 617 618 619 620 621 622

            call mpi_recv(hv, nb,          &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
623
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
624
            if (wantDebug) call obj%timer%stop("mpi_communication")
625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676

#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
            tau = hv(1)
#if REALCASE == 1
            hv(1) = CONST_1_0
#endif
#if COMPLEXCASE == 1
            hv(1) = CONST_COMPLEX_1_0
#endif
          endif
        endif

        na_s = na_s+1
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
#if REALCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
#endif
          n_off = n_off + nb
        endif

#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
677
            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

701
                ! Store Householder Vector for back transformation
702 703 704 705 706 707 708 709 710 711 712

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
713
                if (wantDebug) call obj%timer%start("blas")
714 715 716 717 718 719
#if REALCASE == 1
                call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
720
                if (wantDebug) call obj%timer%stop("blas")
721 722
#if REALCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*tau
Andreas Marek's avatar
Andreas Marek committed
723
                hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
724 725 726 727 728
#endif
#if COMPLEXCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
                hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#endif
729
                if (wantDebug) call obj%timer%start("blas")
730 731 732 733 734 735
#if REALCASE == 1
                call PRECISION_SYR2('L', nc, -CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
736
                if (wantDebug) call obj%timer%stop("blas")
737 738 739 740 741 742 743 744 745 746 747
#if REALCASE == 1
                hv_t(:,my_thread) = CONST_0_0
                tau_t(my_thread)  = CONST_0_0
#endif
#if COMPLEXCASE == 1
              hv_t(:,my_thread) = CONST_COMPLEX_0_0
              tau_t(my_thread)  = CONST_COMPLEX_0_0
#endif
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
748
                if (wantDebug) call obj%timer%start("blas")
749 750
                call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
751
                        CONST_0_0, &
752 753 754 755
#endif
#if COMPLEXCASE == 1
                                    CONST_COMPLEX_PAIR_0_0, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
756
            hs, 1)
757
                if (wantDebug) call obj%timer%stop("blas")
758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
                  vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
776
#endif /* COMPLEXCASE */
777

778 779 780 781 782 783
#if REALCASE == 1
                  call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
                  call hh_transform_complex_&
#endif
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
784
                  &PRECISION &
785
                        (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
786

787 788 789 790
#if REALCASE == 1
                  hv_t(1   ,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
791 792 793 794 795 796 797 798 799 800 801
                  hv_t(1   ,my_thread) = CONST_COMPLEX_1_0
#endif
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
#if REALCASE == 1
                  ab(nb+2:,ns) = CONST_0_0
#endif
#if COMPLEXCASE == 1
                  ab(nb+2:,ns) = CONST_COMPLEX_0_0
#endif
                  ! update subdiagonal block for old and new Householder transformation
                  ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
802
                  if (wantDebug) call obj%timer%start("blas")
803 804 805 806 807 808
#if REALCASE == 1
                  call PRECISION_GEMV('T',            &
#endif
#if COMPLEXCASE == 1
                  call PRECISION_GEMV('C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
809
                          nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, &
810
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
811
              CONST_0_0,      &
812 813 814 815
#endif
#if COMPLEXCASE == 1
                                      CONST_COMPLEX_PAIR_0_0, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
816
              h(2), 1)
817
                  if (wantDebug) call obj%timer%stop("blas")
818 819 820 821 822 823 824

                  x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                  h(2:nb) = h(2:nb) - x*hv(2:nb)
                  ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                  do i=2,nb
                    ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
825
                                      h(i) - hs(1:nr)*hv(i)
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857
#endif
#if COMPLEXCASE == 1
                                                  conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
                  enddo

                else

                  ! No new Householder transformation for nr=1, just complete the old one
                  ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
                  do i=2,nb
#if REALCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
                  enddo
                  ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
#if REALCASE == 1
                  hv_t(1,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
                  hv_t(1,my_thread) = CONST_COMPLEX_1_0
#endif
                endif

              enddo

            enddo ! my_thread
!$omp end parallel do

858
            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
859 860 861 862 863 864 865

            if (iter==1) then
              ! We are at the end of the first block

              ! Send our first column to previous PE
              if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
866
                if (wantDebug) call obj%timer%start("mpi_communication")
867
                call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
868
                if (wantDebug) call obj%timer%stop("mpi_communication")
869 870 871 872

#endif
                ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
873
                if (wantDebug) call obj%timer%start("mpi_communication")
874 875
                call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
876
                   MPI_REAL_PRECISION, &
877 878 879 880
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
881
             my_pe-1, 1, communicator, ireq_ab, mpierr)
882
                if (wantDebug) call obj%timer%stop("mpi_communication")
883 884 885 886 887 888 889

#endif /* WITH_MPI */
              endif

              ! Request last column from next PE
              ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
890
              if (wantDebug) call obj%timer%start("mpi_communication")
891 892 893 894

              if (istep>=max_threads .and. ne <= na) then
                call mpi_recv(ab(1,ne-n_off), nb+1,       &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
895
                  MPI_REAL_PRECISION, &
896 897 898 899
#endif
#if COMPLEXCASE == 1
                              MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
900
                              my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
901
              endif
902
              if (wantDebug) call obj%timer%stop("mpi_communication")
903 904 905 906 907 908 909 910
#else /* WITH_MPI */
              if (istep>=max_threads .and. ne <= na) then
                ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
              endif
#endif /* WITH_MPI */
            else
              ! We are at the end of all blocks

911
              ! Send last HH Vector and TAU to next PE if it has been calculated above
912 913 914
              ne = na_s + nblocks*nb - (max_threads-1) - 1
              if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
915
                if (wantDebug) call obj%timer%start("mpi_communication")
916
                call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
917
                if (wantDebug) call obj%timer%stop("mpi_communication")
918 919 920 921 922
#endif
                hv_s(1) = tau_t(max_threads)
                hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
923
                if (wantDebug) call obj%timer%start("mpi_communication")
924 925
                call mpi_isend(hv_s, nb,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
926
                   MPI_REAL_PRECISION, &
927 928 929 930
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
931
                               my_pe+1, 2, communicator, ireq_hv, mpierr)
932
                if (wantDebug) call obj%timer%stop("mpi_communication")
933 934 935 936

#endif /* WITH_MPI */
              endif

937
              ! "Send" HH Vector and TAU to next OpenMP thread
938 939 940 941 942 943 944 945 946 947 948 949 950 951 952
              do my_thread = max_threads, 2, -1
                hv_t(:,my_thread) = hv_t(:,my_thread-1)
                tau_t(my_thread)  = tau_t(my_thread-1)
              enddo

            endif
          enddo ! iter

        else

          ! Codepath for 1 thread without OpenMP

          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
953
          ! and sends the Householder Vector and the first column as early
954 955 956 957 958 959 960 961 962 963
          ! as possible.

#endif /* WITH_OPENMP */

          do iblk=1,nblocks
            ns = na_s + (iblk-1)*nb - n_off ! first column in block
            ne = ns+nb-1                    ! last column in block

            if (ns+n_off>na) exit

964
            ! Store Householder Vector for back transformation
965 966 967 968 969 970 971 972 973 974

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
            if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
              ! Wait for last transfer to finish
#ifdef WITH_MPI
975
              if (wantDebug) call obj%timer%start("mpi_communication")
976 977

              call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
978
              if (wantDebug) call obj%timer%stop("mpi_communication")
979 980 981 982 983 984
#endif
              ! Copy vectors into send buffer
              hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
              ! Send to destination

#ifdef WITH_MPI
985
              if (wantDebug) call obj%timer%start("mpi_communication")
986 987
              call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
988
                       MPI_REAL_PRECISION, &
989 990 991 992 993
#endif
#if COMPLEXCASE == 1
                             MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                             global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
994
                             10+iblk, communicator, ireq_hhs(iblk), mpierr)
995
              if (wantDebug) call obj%timer%stop("mpi_communication")
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019
#else /* WITH_MPI */
             ! do the post-poned irecv here
             startAddr = startAddr - hh_cnt(iblk)
#if REALCASE == 1
             hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif
#if COMPLEXCASE == 1
             hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif
#endif /* WITH_MPI */

            ! Reset counter and increase destination row
#if REALCASE == 1
              hh_cnt(iblk) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              hh_cnt(iblk) = 0
#endif
              hh_dst(iblk) = hh_dst(iblk)+1
            endif

            ! The following code is structured in a way to keep waiting times for
            ! other PEs at a minimum, especially if there is only one block.
            ! For this reason, it requests the last column as late as possible
1020
            ! and sends the Householder Vector and the first column as early
1021 1022 1023 1024 1025 1026
            ! as possible.
#endif /* WITH_OPENMP */
            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

1027
            ! Multiply diagonal block and subdiagonal block with Householder Vector
1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040

            if (iblk==nblocks .and. nc==nb) then

              ! We need the last column from the next PE.
              ! First do the matrix multiplications without last column ...

              ! Diagonal block, the contribution of the last element is added below!
#if REALCASE == 1
              ab(1,ne) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              ab(1,ne) = CONST_COMPLEX_0_0
#endif
1041
              if (wantDebug) call obj%timer%start("blas")
1042 1043 1044 1045 1046 1047 1048 1049 1050 1051

#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,CONST_COMPLEX_PAIR_0_0,hd,1)
#endif
              ! Subdiagonal block
              if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1052
                                      CONST_0_0, hs, 1)
1053 1054 1055 1056
#endif
#if COMPLEXCASE == 1
                                            CONST_COMPLEX_PAIR_0_0,hs,1)
#endif
1057
              if (wantDebug) call obj%timer%stop("blas")
1058 1059 1060

              ! ... then request last column ...
#ifdef WITH_MPI
1061
              if (wantDebug) call obj%timer%start("mpi_communication")
1062 1063 1064
#ifdef WITH_OPENMP
              call mpi_recv(ab(1,ne), nb+1,    &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1065
                      MPI_REAL_PRECISION, &
1066 1067 1068 1069
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
1070
          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
1071 1072 1073
#else /* WITH_OPENMP */
              call mpi_recv(ab(1,ne), nb+1,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1074
                      MPI_REAL_PRECISION, &
1075 1076 1077 1078
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
1079
                      my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
1080
#endif /* WITH_OPENMP */
1081
              if (wantDebug) call obj%timer%stop("mpi_communication")
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
#else /* WITH_MPI */

              ab(1:nb+1,ne) = ab_s(1:nb+1)

#endif /* WITH_MPI */

              ! ... and complete the result
              hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
              hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau