#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#if REALCASE == 1
    subroutine tridiag_band_real_&
#endif
#if COMPLEXCASE == 1
    subroutine tridiag_band_complex_&
#endif
    &PRECISION &
    (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
    hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
    !-------------------------------------------------------------------------------
    ! tridiag_band_real/complex:
    ! Reduces a real symmetric / complex Hermitian band matrix to tridiagonal form
    !
    !  na          Order of matrix a
    !
    !  nb          Semi bandwidth
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
    ! hh_trans : Householder vectors
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
    !  communicator
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
      use elpa_abstract_impl
      use elpa2_workload
      use precision
      use iso_c_binding
      use redist
      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout)   :: obj
      logical, intent(in)                          :: useGPU, wantDebug
      integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,matrixCols)
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE),intent(in)    :: aMatrix(lda,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(in)   :: aMatrix(lda,matrixCols)
#endif
#endif /* COMPLEXCASE */
      integer(kind=c_intptr_t)                     :: a_dev
      real(kind=REAL_DATATYPE), intent(out)        :: d(na), e(na) ! set only on PE 0
      MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)
      real(kind=REAL_DATATYPE)                     :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      real(kind=REAL_DATATYPE)                     :: hd(nb), hs(nb)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)               :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      complex(kind=COMPLEX_DATATYPE)               :: hd(nb), hs(nb)
#endif

      integer(kind=ik)                             :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
      integer(kind=ik)                             :: my_pe, n_pes, mpierr
      integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
      integer(kind=ik)                             :: ireq_ab, ireq_hv
      integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
!      integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
#endif
!      integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: omp_block_limits(:)
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable        :: hv_t(:,:), tau_t(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable  :: hv_t(:,:), tau_t(:)
#endif
      integer(kind=ik)                             :: omp_get_max_threads
#endif /* WITH_OPENMP */
      integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
      integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
      integer(kind=ik), allocatable                :: block_limits(:)
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable  :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
      integer                                      :: istat
      character(200)                               :: errorMessage

#ifndef WITH_MPI
      integer(kind=ik)                             :: startAddr
#endif
      call obj%timer%start("tridiag_band_&
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_comm_rank(communicator,my_pe,mpierr)
      call mpi_comm_size(communicator,n_pes,mpierr)

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")

      ! Get global_id mapping 2D processor coordinates to global id

      allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating global_id "//errorMessage
        stop 1
      endif

      global_id(:,:) = 0
      global_id(my_prow, my_pcol) = my_pe

#ifdef WITH_OPENMP
      allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating global_id_tmp "//errorMessage
        stop 1
      endif
#endif

#ifdef WITH_MPI
      if (wantDebug) call obj%timer%start("mpi_communication")
#ifndef WITH_OPENMP
      call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
#else
      global_id_tmp(:,:) = global_id(:,:)
      call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
      deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when deallocating global_id_tmp "//errorMessage
        stop 1
      endif
#endif /* WITH_OPENMP */
      if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

      ! Total number of blocks in the band:

      nblocks_total = (na-1)/nb + 1

      ! Set work distribution

      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating block_limits"//errorMessage
        stop 1
      endif

      call divide_band(obj,nblocks_total, n_pes, block_limits)

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)

      ! allocate the part of the band matrix which is needed by this PE
      ! The size is 1 block larger than needed to avoid extensive shifts
      allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating ab"//errorMessage
        stop 1
      endif

      ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
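
      ! Storage scheme of ab: the band is kept in lower band form, i.e. for a band
      ! column j, ab(1,j) holds the diagonal element and ab(2:nb+1,j) the elements
      ! below the diagonal (cf. d(istep) = ab(1,...) and e(istep) = ab(2,...) further
      ! down). The BLAS calls below address blocks of this storage with leading
      ! dimension 2*nb-1.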

      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb

      ! Redistribute band in a to ab
      call redist_band_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)

      ! Calculate the workload for each sweep in the back transformation
      ! and the space requirements to hold the HH vectors

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating limits"//errorMessage
        stop 1
      endif

      call determine_workload(obj,na, nb, np_rows, limits)
      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      do n = 1, nblocks_total
        call determine_workload(obj, nx, nb, np_rows, limits)
        local_size = limits(my_prow+1) - limits(my_prow)
        ! add to number of householder vectors
        ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_hh_vecs = num_hh_vecs + local_size
          num_chunks  = num_chunks+1
        endif
        nx = nx - nb
      enddo

      ! Allocate space for HH vectors

      allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
#if REALCASE == 1
        print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
#endif
#if COMPLEXCASE == 1
        print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
#endif
        stop 1
      endif

      ! Allocate and init MPI requests

      allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating ireq_hhr"//errorMessage
        stop 1
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating ireq_hhs"//errorMessage
        stop 1
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
        call determine_workload(obj,nx, nb, np_rows, limits)
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks  = num_chunks+1
#ifdef WITH_MPI
329
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_irecv(hh_trans(1,num_hh_vecs+1),          &
                   nb*local_size,                           &
#if REALCASE == 1
       MPI_REAL_PRECISION,                     &
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION,           &
#endif
                        nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")

#else /* WITH_MPI */
          ! careful: for a non-blocking recv the data copy must be done at wait or send
          ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)

#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating hh_gath"//errorMessage
        stop 1
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating hh_send"//errorMessage
        stop 1
      endif

#if REALCASE == 1
      hh_gath(:,:,:) = CONST_0_0
      hh_send(:,:,:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
      hh_gath(:,:,:) = CONST_COMPLEX_0_0
      hh_send(:,:,:) = CONST_COMPLEX_0_0
#endif
      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating hh_cnt"//errorMessage
        stop 1
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating hh_dst"//errorMessage
        stop 1
      endif

      hh_cnt(:) = 1 ! The first transformation Vector is always 0 and not calculated at all
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating snd_limits"//errorMessage
        stop 1
      endif
      do iblk=1,nblocks
        call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:

      max_threads = 1
#if REALCASE == 1
      max_threads = omp_get_max_threads()
#endif
#if COMPLEXCASE == 1
!$      max_threads = omp_get_max_threads()
#endif
      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating omp_block_limits"//errorMessage
        stop 1
      endif

      ! Get the OpenMP block limits
      call divide_band(obj,nblocks, max_threads, omp_block_limits)

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
  &MATH_DATATYPE&
  &: error when allocating hv_t, tau_t"//errorMessage
        stop 1
      endif

#if REALCASE == 1
      hv_t = 0
      tau_t = 0
#endif
#if COMPLEXCASE == 1
      hv_t = CONST_COMPLEX_0_0
      tau_t = CONST_COMPLEX_0_0
#endif
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1
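      ! na_s is the global index of the first band column this PE still has to
      ! process; it is advanced by one column per sweep in the main loop below.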

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
        call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
                 MPI_REAL_PRECISION,  &
#endif
#if COMPLEXCASE == 1
                       MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
           my_pe-1, 1, communicator, ireq_ab, mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
          startAddr = ubound(hh_trans,dim=2)
#endif /* WITH_MPI */
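
      ! Main loop over the Householder sweeps. In sweep istep the first remaining
      ! band column is eliminated by a Householder transformation; applying this
      ! transformation to the band creates fill-in (a bulge) one block further down,
      ! which is chased towards the end of the band in the loop over the blocks
      ! (iblk) inside each sweep.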

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif

        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
#if REALCASE == 1
          hv(:) = CONST_0_0
          tau = CONST_0_0
#endif
#if COMPLEXCASE == 1
          hv(:) = CONST_COMPLEX_0_0
          tau = CONST_COMPLEX_0_0
#endif

          ! Transform first column of remaining matrix
#if REALCASE == 1
          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
         ! In contrast to the real case, the last step (istep=na-1) is needed here for making
         ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
          if (istep < na-1) then
            ! Transform first column of remaining matrix
            vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only
#endif /* COMPLEXCASE */

#if REALCASE == 1
            call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
            call hh_transform_complex_&
#endif
&PRECISION &
                               (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)

#if REALCASE == 1
            hv(1) = CONST_1_0
            hv(2:n) = ab(3:n+1,na_s-n_off)*hf
          endif
#endif
#if COMPLEXCASE == 1
          hv(1) = CONST_COMPLEX_1_0
          hv(2:n) = ab(3:n+1,na_s-n_off)*hf
#endif
          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
          if (istep == na-1) then
            d(na) = ab(1,na_s+1-n_off)
#if REALCASE == 1
            e(na) = CONST_0_0
#endif
#if COMPLEXCASE == 1
           e(na) = CONST_REAL_0_0
#endif
          endif
        else
          if (na>na_s) then
            ! Receive Householder Vector from previous task, from PE owning subdiagonal

#ifdef WITH_OPENMP

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_recv(hv, nb,       &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")

#else /* WITH_MPI */

            hv(1:nb) = hv_s(1:nb)

#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")

            call mpi_recv(hv, nb,          &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")

#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
            tau = hv(1)
#if REALCASE == 1
            hv(1) = CONST_1_0
#endif
#if COMPLEXCASE == 1
            hv(1) = CONST_COMPLEX_1_0
#endif
          endif
        endif

        na_s = na_s+1
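        ! Once a full block of nb columns has been processed, shift the local band
        ! window one block to the left (ab was allocated one block larger than
        ! needed precisely to avoid shifting after every single column).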
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
#if REALCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
#endif
          n_off = n_off + nb
        endif

#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

                ! Store Householder Vector for back transformation

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
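                ! The two-sided update A <- H*A*H^H with H = I - tau*v*v^H is done as:
                ! hd = tau*A*v (SYMV/HEMV), then hd <- hd - 0.5*x*v with x = (v^H*hd)*tau
                ! (conjg(tau) in the complex case), followed by the symmetric/Hermitian
                ! rank-2 update A <- A - hd*v^H - v*hd^H (SYR2/HER2).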
                if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
                call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
                if (wantDebug) call obj%timer%stop("blas")
#if REALCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*tau
                hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
#endif
#if COMPLEXCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
                hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#endif
                if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
                call PRECISION_SYR2('L', nc, -CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
                if (wantDebug) call obj%timer%stop("blas")
#if REALCASE == 1
                hv_t(:,my_thread) = CONST_0_0
                tau_t(my_thread)  = CONST_0_0
#endif
#if COMPLEXCASE == 1
              hv_t(:,my_thread) = CONST_COMPLEX_0_0
              tau_t(my_thread)  = CONST_COMPLEX_0_0
#endif
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
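                ! hs = tau * A_sub * v is used twice below: to complete the current
                ! transformation on the first column of the subdiagonal block and in
                ! the nonsymmetric rank-2 update that chases the bulge into the
                ! next block.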
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
                        CONST_0_0, &
#endif
#if COMPLEXCASE == 1
                                    CONST_COMPLEX_PAIR_0_0, &
#endif
            hs, 1)
                if (wantDebug) call obj%timer%stop("blas")
                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
                  vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
#endif /* COMPLEXCASE */

#if REALCASE == 1
                  call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
                  call hh_transform_complex_&
#endif
                  &PRECISION &
                        (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)

#if REALCASE == 1
                  hv_t(1   ,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
                  hv_t(1   ,my_thread) = CONST_COMPLEX_1_0
#endif
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
#if REALCASE == 1
                  ab(nb+2:,ns) = CONST_0_0
#endif
#if COMPLEXCASE == 1
                  ab(nb+2:,ns) = CONST_COMPLEX_0_0
#endif
                  ! update subdiagonal block for old and new Householder transformation
                  ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
                  if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
                  call PRECISION_GEMV('T',            &
#endif
#if COMPLEXCASE == 1
                  call PRECISION_GEMV('C',            &
#endif
                          nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, &
#if REALCASE == 1
              CONST_0_0,      &
#endif
#if COMPLEXCASE == 1
                                      CONST_COMPLEX_PAIR_0_0, &
#endif
              h(2), 1)
                  if (wantDebug) call obj%timer%stop("blas")

                  x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                  h(2:nb) = h(2:nb) - x*hv(2:nb)
                  ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                  do i=2,nb
                    ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
#if REALCASE == 1
                                      h(i) - hs(1:nr)*hv(i)
#endif
#if COMPLEXCASE == 1
                                                  conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
                  enddo

                else

                  ! No new Householder transformation for nr=1, just complete the old one
                  ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
                  do i=2,nb
#if REALCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
                  enddo
                  ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
#if REALCASE == 1
                  hv_t(1,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
                  hv_t(1,my_thread) = CONST_COMPLEX_1_0
#endif
                endif

              enddo

            enddo ! my_thread
!$omp end parallel do

            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

            if (iter==1) then
              ! We are at the end of the first block

              ! Send our first column to previous PE
              if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif
                ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
                   MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
             my_pe-1, 1, communicator, ireq_ab, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
              endif

              ! Request last column from next PE
              ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")

              if (istep>=max_threads .and. ne <= na) then
                call mpi_recv(ab(1,ne-n_off), nb+1,       &
#if REALCASE == 1
                  MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                              MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
                              my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
              endif
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
              if (istep>=max_threads .and. ne <= na) then
                ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
              endif
#endif /* WITH_MPI */
            else
              ! We are at the end of all blocks

              ! Send last HH Vector and TAU to next PE if it has been calculated above
              ne = na_s + nblocks*nb - (max_threads-1) - 1
              if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
                hv_s(1) = tau_t(max_threads)
                hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_isend(hv_s, nb,     &
#if REALCASE == 1
                   MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                               my_pe+1, 2, communicator, ireq_hv, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
              endif

              ! "Send" HH Vector and TAU to next OpenMP thread
              do my_thread = max_threads, 2, -1
                hv_t(:,my_thread) = hv_t(:,my_thread-1)
                tau_t(my_thread)  = tau_t(my_thread-1)
              enddo

            endif
          enddo ! iter

        else

          ! Codepath for 1 thread without OpenMP

          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
          ! and sends the Householder Vector and the first column as early
          ! as possible.

#endif /* WITH_OPENMP */

          do iblk=1,nblocks
            ns = na_s + (iblk-1)*nb - n_off ! first column in block
            ne = ns+nb-1                    ! last column in block

            if (ns+n_off>na) exit

            ! Store Householder Vector for back transformation

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
            if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
              ! Wait for last transfer to finish
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")

              call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
              if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
              ! Copy vectors into send buffer
              hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
              ! Send to destination

#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")
              call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),  &
#if REALCASE == 1
                       MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                             MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                             global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
                             10+iblk, communicator, ireq_hhs(iblk), mpierr)
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
             ! do the postponed irecv here
             startAddr = startAddr - hh_cnt(iblk)
             hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */

            ! Reset counter and increase destination row
              hh_cnt(iblk) = 0
              hh_dst(iblk) = hh_dst(iblk)+1
            endif

            ! The following code is structured in a way to keep waiting times for
            ! other PEs at a minimum, especially if there is only one block.
            ! For this reason, it requests the last column as late as possible
            ! and sends the Householder Vector and the first column as early
            ! as possible.
#endif /* WITH_OPENMP */
            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

            ! Multiply diagonal block and subdiagonal block with Householder Vector
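            ! As in the OpenMP codepath above: hd = tau*A_diag*v and hs = tau*A_sub*v
            ! are formed first and then used for the two-sided update of the diagonal
            ! block and for chasing the bulge into the next block.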

            if (iblk==nblocks .and. nc==nb) then

              ! We need the last column from the next PE.
              ! First do the matrix multiplications without last column ...

              ! Diagonal block, the contribution of the last element is added below!
#if REALCASE == 1
              ab(1,ne) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              ab(1,ne) = CONST_COMPLEX_0_0
#endif
              if (wantDebug) call obj%timer%start("blas")

#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,CONST_COMPLEX_PAIR_0_0,hd,1)
#endif
              ! Subdiagonal block
              if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,  &
#if REALCASE == 1
                                      CONST_0_0, hs, 1)
#endif
#if COMPLEXCASE == 1
                                            CONST_COMPLEX_PAIR_0_0,hs,1)
#endif
              if (wantDebug) call obj%timer%stop("blas")

              ! ... then request last column ...
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")
#ifdef WITH_OPENMP
              call mpi_recv(ab(1,ne), nb+1,    &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#else /* WITH_OPENMP */
              call mpi_recv(ab(1,ne), nb+1,     &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
                      my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#endif /* WITH_OPENMP */
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */

              ab(1:nb+1,ne) = ab_s(1:nb+1)

#endif /* WITH_MPI */

              ! ... and complete the result
              hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
              hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau

            else

              ! Normal matrix multiply
              if (wantDebug) call obj%timer%start("blas")