elpa2_tridiag_band_template.F90 48.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53

54
#if REALCASE == 1
55
    subroutine tridiag_band_real_&
56 57
#endif
#if COMPLEXCASE == 1
58
    subroutine tridiag_band_complex_&
59
#endif
60
    &PRECISION &
61
    (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
62
    hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
63 64 65 66 67 68 69 70 71 72
    !-------------------------------------------------------------------------------
    ! tridiag_band_real/complex:
    ! Reduces a real symmetric band matrix to tridiagonal form
    !
    !  na          Order of matrix a
    !
    !  nb          Semi bandwith
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
Andreas Marek's avatar
Andreas Marek committed
73
    !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
74 75 76 77
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
78
    ! hh_trans : housholder vectors
79 80 81 82 83 84 85 86
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
87
    !  communicator
88 89
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
90
      use elpa_abstract_impl
91 92
      use elpa2_workload
      use precision
Andreas Marek's avatar
Andreas Marek committed
93
      use iso_c_binding
94
      use redist
95
      implicit none
96
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
97
      class(elpa_abstract_impl_t), intent(inout)   :: obj
98
      logical, intent(in)                          :: useGPU, wantDebug
99
      integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
100
#ifdef USE_ASSUMED_SIZE
101
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,*)
102
#else
103
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,matrixCols)
104
#endif
Andreas Marek's avatar
Andreas Marek committed
105
      integer(kind=c_intptr_t)                     :: a_dev
106
      real(kind=rk), intent(out)        :: d(na), e(na) ! set only on PE 0
107
      MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)
108 109 110 111

      real(kind=rk)                     :: vnorm2
      MATH_DATATYPE(kind=rck)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      MATH_DATATYPE(kind=rck)                     :: hd(nb), hs(nb)
112 113 114 115 116 117 118 119 120 121 122 123 124 125

      integer(kind=ik)                             :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
      integer(kind=ik)                             :: my_pe, n_pes, mpierr
      integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
      integer(kind=ik)                             :: ireq_ab, ireq_hv
      integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
!      integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
#endif
!      integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: omp_block_limits(:)
126
      MATH_DATATYPE(kind=rck), allocatable        :: hv_t(:,:), tau_t(:)
127 128 129 130 131
      integer(kind=ik)                             :: omp_get_max_threads
#endif /* WITH_OPENMP */
      integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
      integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
      integer(kind=ik), allocatable                :: block_limits(:)
132
      MATH_DATATYPE(kind=rck), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
133 134 135 136 137 138
      integer                                      :: istat
      character(200)                               :: errorMessage

#ifndef WITH_MPI
      integer(kind=ik)                             :: startAddr
#endif
139
      call obj%timer%start("tridiag_band_&
140
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
141
      &" // &
142 143
      &PRECISION_SUFFIX &
      )
144
      if (wantDebug) call obj%timer%start("mpi_communication")
145 146
      call mpi_comm_rank(communicator,my_pe,mpierr)
      call mpi_comm_size(communicator,n_pes,mpierr)
147 148 149 150 151

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
152
      if (wantDebug) call obj%timer%stop("mpi_communication")
153 154 155 156 157 158

      ! Get global_id mapping 2D procssor coordinates to global id

      allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
159 160
  &MATH_DATATYPE&
  &: error when allocating global_id "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
161
        stop 1
162 163 164 165 166 167 168 169 170
      endif

      global_id(:,:) = 0
      global_id(my_prow, my_pcol) = my_pe

#ifdef WITH_OPENMP
      allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
171 172
  &MATH_DATATYPE&
  &: error when allocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
173
        stop 1
174 175 176 177
      endif
#endif

#ifdef WITH_MPI
178
      if (wantDebug) call obj%timer%start("mpi_communication")
179
#ifndef WITH_OPENMP
180
      call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
181 182
#else
      global_id_tmp(:,:) = global_id(:,:)
183
      call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
184 185 186
      deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
187 188
  &MATH_DATATYPE&
  &: error when deallocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
189
        stop 1
190 191
      endif
#endif /* WITH_OPENMP */
192
      if (wantDebug) call obj%timer%stop("mpi_communication")
193 194 195 196 197 198 199 200 201 202 203
#endif /* WITH_MPI */

      ! Total number of blocks in the band:

      nblocks_total = (na-1)/nb + 1

      ! Set work distribution

      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
204 205
  &MATH_DATATYPE&
  &: error when allocating block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
206
        stop 1
207 208
      endif

209
      call divide_band(obj,nblocks_total, n_pes, block_limits)
210 211 212 213 214 215 216 217 218

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)

      ! allocate the part of the band matrix which is needed by this PE
      ! The size is 1 block larger than needed to avoid extensive shifts
      allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
219 220
  &MATH_DATATYPE&
  &: error when allocating ab"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
221
        stop 1
222 223 224 225 226 227 228 229
      endif

      ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb

      ! Redistribute band in a to ab
Andreas Marek's avatar
Andreas Marek committed
230 231 232 233
      call redist_band_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
234
      &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
Andreas Marek's avatar
Andreas Marek committed
235

236 237 238 239 240 241
      ! Calculate the workload for each sweep in the back transformation
      ! and the space requirements to hold the HH vectors

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
242 243
  &MATH_DATATYPE&
  &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
244
        stop 1
245 246
      endif

247
      call determine_workload(obj,na, nb, np_rows, limits)
248 249 250 251 252 253
      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      do n = 1, nblocks_total
254
        call determine_workload(obj, nx, nb, np_rows, limits)
255 256
        local_size = limits(my_prow+1) - limits(my_prow)
        ! add to number of householder vectors
257
        ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor send below!
258 259 260 261 262 263 264 265 266
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_hh_vecs = num_hh_vecs + local_size
          num_chunks  = num_chunks+1
        endif
        nx = nx - nb
      enddo

      ! Allocate space for HH vectors

267
      allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
268
      if (istat .ne. 0) then
269 270
#if REALCASE == 1
        print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
271 272
#endif
#if COMPLEXCASE == 1
273 274
        print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
#endif
Andreas Marek's avatar
Andreas Marek committed
275
        stop 1
276 277 278 279 280 281 282
      endif

      ! Allocate and init MPI requests

      allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
283 284
  &MATH_DATATYPE&
  &: error when allocating ireq_hhr"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
285
        stop 1
286 287 288 289
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
290 291
  &MATH_DATATYEP&
  &: error when allocating ireq_hhs"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
292
        stop 1
293 294 295 296 297 298 299
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
300
        call determine_workload(obj,nx, nb, np_rows, limits)
301 302 303 304
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks  = num_chunks+1
#ifdef WITH_MPI
305
          if (wantDebug) call obj%timer%start("mpi_communication")
306
          call mpi_irecv(hh_trans(1,num_hh_vecs+1),          &
Andreas Marek's avatar
Retab  
Andreas Marek committed
307
                   nb*local_size,                           &
308
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
309
       MPI_REAL_PRECISION,                     &
310 311 312 313
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION,           &
#endif
314
                        nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
315
          if (wantDebug) call obj%timer%stop("mpi_communication")
316 317 318

#else /* WITH_MPI */
          ! carefull non-block recv data copy must be done at wait or send
319 320
          ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)

321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
337 338
  &MATH_DATATYPE&
  &: error when allocating hh_gath"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
339
        stop 1
340 341 342 343 344
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
345 346
  &MATH_DATATYPE&
  &: error when allocating hh_send"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
347
        stop 1
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
      endif

#if REALCASE == 1
      hh_gath(:,:,:) = CONST_0_0
      hh_send(:,:,:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
      hh_gath(:,:,:) = CONST_COMPLEX_0_0
      hh_send(:,:,:) = CONST_COMPLEX_0_0
#endif
      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
363 364
  &MATH_DATATYPE&
  &: error when allocating hh_cnt"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
365
        stop 1
366 367 368 369 370
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
371 372
  &MATH_DATATYPE&
  &: error when allocating hh_dst"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
373
        stop 1
374 375
      endif

376
      hh_cnt(:) = 1 ! The first transfomation Vector is always 0 and not calculated at all
377 378 379 380 381 382 383 384 385 386
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
387 388
  &MATH_DATATYPE&
  &: error when allocating snd_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
389
        stop 1
390 391
      endif
      do iblk=1,nblocks
392
        call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:

      max_threads = 1
#if REALCASE == 1
      max_threads = omp_get_max_threads()
#endif
#if COMPLEXCASE == 1
!$      max_threads = omp_get_max_threads()
#endif
      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
412 413
  &MATH_DATATYPE&
  &: error when allocating omp_block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
414
        stop 1
415 416 417
      endif

      ! Get the OpenMP block limits
418
      call divide_band(obj,nblocks, max_threads, omp_block_limits)
419 420 421 422

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
423 424
  &MATH_DATATYPE&
  &: error when allocating hv_t, tau_t"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
425
        stop 1
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
      endif

#if REALCASE == 1
      hv_t = 0
      tau_t = 0
#endif
#if COMPLEXCASE == 1
      hv_t = CONST_COMPLEX_0_0
      tau_t = CONST_COMPLEX_0_0
#endif
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
448
        if (wantDebug) call obj%timer%start("mpi_communication")
449 450
        call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
451
                 MPI_REAL_PRECISION,  &
452 453 454 455
#endif
#if COMPLEXCASE == 1
                       MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
456
           my_pe-1, 1, communicator, ireq_ab, mpierr)
457
        if (wantDebug) call obj%timer%stop("mpi_communication")
458 459 460 461
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
462
          startAddr = ubound(hh_trans,dim=2)
463
#endif /* WITH_MPI */
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif

        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
#if REALCASE == 1
          hv(:) = CONST_0_0
          tau = CONST_0_0
#endif
#if COMPLEXCASE == 1
          hv(:) = CONST_COMPLEX_0_0
          tau = CONST_COMPLEX_0_0
#endif

          ! Transform first column of remaining matrix
#if REALCASE == 1
          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
         ! Opposed to the real case, the last step (istep=na-1) is needed here for making
         ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
          if (istep < na-1) then
            ! Transform first column of remaining matrix
            vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only
504
#endif /* COMPLEXCASE */
505

506 507 508 509
#if REALCASE == 1
            call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
510
            call hh_transform_complex_&
511 512
#endif
&PRECISION &
513
                               (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
514 515 516 517 518 519 520

#if REALCASE == 1
            hv(1) = CONST_1_0
            hv(2:n) = ab(3:n+1,na_s-n_off)*hf
          endif
#endif
#if COMPLEXCASE == 1
521 522
          hv(1) = CONST_COMPLEX_1_0
          hv(2:n) = ab(3:n+1,na_s-n_off)*hf
523
#endif
524 525 526 527 528 529 530 531 532 533 534 535 536
          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
          if (istep == na-1) then
            d(na) = ab(1,na_s+1-n_off)
#if REALCASE == 1
            e(na) = CONST_0_0
#endif
#if COMPLEXCASE == 1
           e(na) = CONST_REAL_0_0
#endif
          endif
        else
          if (na>na_s) then
537
            ! Receive Householder Vector from previous task, from PE owning subdiagonal
538 539 540 541

#ifdef WITH_OPENMP

#ifdef WITH_MPI
542
            if (wantDebug) call obj%timer%start("mpi_communication")
543 544 545 546 547 548 549
            call mpi_recv(hv, nb,       &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
550
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
551
            if (wantDebug) call obj%timer%stop("mpi_communication")
552 553 554 555 556 557 558 559 560 561

#else /* WITH_MPI */

            hv(1:nb) = hv_s(1:nb)

#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
562
            if (wantDebug) call obj%timer%start("mpi_communication")
563 564 565 566 567 568 569 570

            call mpi_recv(hv, nb,          &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
571
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
572
            if (wantDebug) call obj%timer%stop("mpi_communication")
573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624

#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
            tau = hv(1)
#if REALCASE == 1
            hv(1) = CONST_1_0
#endif
#if COMPLEXCASE == 1
            hv(1) = CONST_COMPLEX_1_0
#endif
          endif
        endif

        na_s = na_s+1
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
#if REALCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
#endif
          n_off = n_off + nb
        endif

#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
625
            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

649
                ! Store Householder Vector for back transformation
650 651 652 653 654 655 656 657 658 659 660

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
661
                if (wantDebug) call obj%timer%start("blas")
662 663 664 665 666 667
#if REALCASE == 1
                call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
668
                if (wantDebug) call obj%timer%stop("blas")
669 670
#if REALCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*tau
Andreas Marek's avatar
Andreas Marek committed
671
                hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
672 673 674 675 676
#endif
#if COMPLEXCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
                hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#endif
677
                if (wantDebug) call obj%timer%start("blas")
678 679 680 681 682 683
#if REALCASE == 1
                call PRECISION_SYR2('L', nc, -CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#if COMPLEXCASE == 1
                call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
684
                if (wantDebug) call obj%timer%stop("blas")
685 686 687 688 689 690 691 692 693 694 695
#if REALCASE == 1
                hv_t(:,my_thread) = CONST_0_0
                tau_t(my_thread)  = CONST_0_0
#endif
#if COMPLEXCASE == 1
              hv_t(:,my_thread) = CONST_COMPLEX_0_0
              tau_t(my_thread)  = CONST_COMPLEX_0_0
#endif
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
696
                if (wantDebug) call obj%timer%start("blas")
697 698
                call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
699
                        CONST_0_0, &
700 701 702 703
#endif
#if COMPLEXCASE == 1
                                    CONST_COMPLEX_PAIR_0_0, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
704
            hs, 1)
705
                if (wantDebug) call obj%timer%stop("blas")
706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
                  vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
724
#endif /* COMPLEXCASE */
725

726 727 728 729 730 731
#if REALCASE == 1
                  call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
                  call hh_transform_complex_&
#endif
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
732
                  &PRECISION &
733
                        (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
734

735 736 737 738
#if REALCASE == 1
                  hv_t(1   ,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
739 740 741 742 743 744 745 746 747 748 749
                  hv_t(1   ,my_thread) = CONST_COMPLEX_1_0
#endif
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
#if REALCASE == 1
                  ab(nb+2:,ns) = CONST_0_0
#endif
#if COMPLEXCASE == 1
                  ab(nb+2:,ns) = CONST_COMPLEX_0_0
#endif
                  ! update subdiagonal block for old and new Householder transformation
                  ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
750
                  if (wantDebug) call obj%timer%start("blas")
751 752 753 754 755 756
#if REALCASE == 1
                  call PRECISION_GEMV('T',            &
#endif
#if COMPLEXCASE == 1
                  call PRECISION_GEMV('C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
757
                          nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, &
758
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
759
              CONST_0_0,      &
760 761 762 763
#endif
#if COMPLEXCASE == 1
                                      CONST_COMPLEX_PAIR_0_0, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
764
              h(2), 1)
765
                  if (wantDebug) call obj%timer%stop("blas")
766 767 768 769 770 771 772

                  x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                  h(2:nb) = h(2:nb) - x*hv(2:nb)
                  ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                  do i=2,nb
                    ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
773
                                      h(i) - hs(1:nr)*hv(i)
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
#endif
#if COMPLEXCASE == 1
                                                  conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
                  enddo

                else

                  ! No new Householder transformation for nr=1, just complete the old one
                  ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
                  do i=2,nb
#if REALCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
                  enddo
                  ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
#if REALCASE == 1
                  hv_t(1,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
                  hv_t(1,my_thread) = CONST_COMPLEX_1_0
#endif
                endif

              enddo

            enddo ! my_thread
!$omp end parallel do

806
            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
807 808 809 810 811 812 813

            if (iter==1) then
              ! We are at the end of the first block

              ! Send our first column to previous PE
              if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
814
                if (wantDebug) call obj%timer%start("mpi_communication")
815
                call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
816
                if (wantDebug) call obj%timer%stop("mpi_communication")
817 818 819 820

#endif
                ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
821
                if (wantDebug) call obj%timer%start("mpi_communication")
822 823
                call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
824
                   MPI_REAL_PRECISION, &
825 826 827 828
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
829
             my_pe-1, 1, communicator, ireq_ab, mpierr)
830
                if (wantDebug) call obj%timer%stop("mpi_communication")
831 832 833 834 835 836 837

#endif /* WITH_MPI */
              endif

              ! Request last column from next PE
              ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
838
              if (wantDebug) call obj%timer%start("mpi_communication")
839 840 841 842

              if (istep>=max_threads .and. ne <= na) then
                call mpi_recv(ab(1,ne-n_off), nb+1,       &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
843
                  MPI_REAL_PRECISION, &
844 845 846 847
#endif
#if COMPLEXCASE == 1
                              MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
848
                              my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
849
              endif
850
              if (wantDebug) call obj%timer%stop("mpi_communication")
851 852 853 854 855 856 857 858
#else /* WITH_MPI */
              if (istep>=max_threads .and. ne <= na) then
                ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
              endif
#endif /* WITH_MPI */
            else
              ! We are at the end of all blocks

859
              ! Send last HH Vector and TAU to next PE if it has been calculated above
860 861 862
              ne = na_s + nblocks*nb - (max_threads-1) - 1
              if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
863
                if (wantDebug) call obj%timer%start("mpi_communication")
864
                call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
865
                if (wantDebug) call obj%timer%stop("mpi_communication")
866 867 868 869 870
#endif
                hv_s(1) = tau_t(max_threads)
                hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
871
                if (wantDebug) call obj%timer%start("mpi_communication")
872 873
                call mpi_isend(hv_s, nb,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
874
                   MPI_REAL_PRECISION, &
875 876 877 878
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
879
                               my_pe+1, 2, communicator, ireq_hv, mpierr)
880
                if (wantDebug) call obj%timer%stop("mpi_communication")
881 882 883 884

#endif /* WITH_MPI */
              endif

885
              ! "Send" HH Vector and TAU to next OpenMP thread
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
              do my_thread = max_threads, 2, -1
                hv_t(:,my_thread) = hv_t(:,my_thread-1)
                tau_t(my_thread)  = tau_t(my_thread-1)
              enddo

            endif
          enddo ! iter

        else

          ! Codepath for 1 thread without OpenMP

          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
901
          ! and sends the Householder Vector and the first column as early
902 903 904 905 906 907 908 909 910 911
          ! as possible.

#endif /* WITH_OPENMP */

          do iblk=1,nblocks
            ns = na_s + (iblk-1)*nb - n_off ! first column in block
            ne = ns+nb-1                    ! last column in block

            if (ns+n_off>na) exit

912
            ! Store Householder Vector for back transformation
913 914 915 916 917 918 919 920 921 922

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
            if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
              ! Wait for last transfer to finish
#ifdef WITH_MPI
923
              if (wantDebug) call obj%timer%start("mpi_communication")
924 925

              call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
926
              if (wantDebug) call obj%timer%stop("mpi_communication")
927 928 929 930 931 932
#endif
              ! Copy vectors into send buffer
              hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
              ! Send to destination

#ifdef WITH_MPI
933
              if (wantDebug) call obj%timer%start("mpi_communication")
934 935
              call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
936
                       MPI_REAL_PRECISION, &
937 938 939 940 941
#endif
#if COMPLEXCASE == 1
                             MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                             global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
942
                             10+iblk, communicator, ireq_hhs(iblk), mpierr)
943
              if (wantDebug) call obj%timer%stop("mpi_communication")
944 945 946
#else /* WITH_MPI */
             ! do the post-poned irecv here
             startAddr = startAddr - hh_cnt(iblk)
947
             hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962
#endif /* WITH_MPI */

            ! Reset counter and increase destination row
#if REALCASE == 1
              hh_cnt(iblk) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              hh_cnt(iblk) = 0
#endif
              hh_dst(iblk) = hh_dst(iblk)+1
            endif

            ! The following code is structured in a way to keep waiting times for
            ! other PEs at a minimum, especially if there is only one block.
            ! For this reason, it requests the last column as late as possible
963
            ! and sends the Householder Vector and the first column as early
964 965 966 967 968 969
            ! as possible.
#endif /* WITH_OPENMP */
            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

970
            ! Multiply diagonal block and subdiagonal block with Householder Vector
971 972 973 974 975 976 977 978 979 980 981 982 983

            if (iblk==nblocks .and. nc==nb) then

              ! We need the last column from the next PE.
              ! First do the matrix multiplications without last column ...

              ! Diagonal block, the contribution of the last element is added below!
#if REALCASE == 1
              ab(1,ne) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              ab(1,ne) = CONST_COMPLEX_0_0
#endif
984
              if (wantDebug) call obj%timer%start("blas")
985 986 987 988 989 990 991 992 993 994

#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,CONST_COMPLEX_PAIR_0_0,hd,1)
#endif
              ! Subdiagonal block
              if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
995
                                      CONST_0_0, hs, 1)
996 997 998 999
#endif
#if COMPLEXCASE == 1
                                            CONST_COMPLEX_PAIR_0_0,hs,1)
#endif
1000
              if (wantDebug) call obj%timer%stop("blas")
1001 1002 1003

              ! ... then request last column ...
#ifdef WITH_MPI
1004
              if (wantDebug) call obj%timer%start("mpi_communication")
1005 1006 1007
#ifdef WITH_OPENMP
              call mpi_recv(ab(1,ne), nb+1,    &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1008
                      MPI_REAL_PRECISION, &
1009 1010 1011 1012
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
1013
          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
1014 1015 1016
#else /* WITH_OPENMP */
              call mpi_recv(ab(1,ne), nb+1,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1017
                      MPI_REAL_PRECISION, &
1018 1019 1020 1021
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
1022
                      my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
1023
#endif /* WITH_OPENMP */
1024
              if (wantDebug) call obj%timer%stop("mpi_communication")
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037
#else /* WITH_MPI */

              ab(1:nb+1,ne) = ab_s(1:nb+1)

#endif /* WITH_MPI */

              ! ... and complete the result
              hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
              hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau

            else

              ! Normal matrix multiply
1038
              if (wantDebug) call obj%timer%start("blas")
1039 1040 1041 1042 1043 1044 1045 1046
#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
              if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
1047
                                      CONST_0_0, hs, 1)
1048 1049 1050 1051 1052
#endif
#if COMPLEXCASE == 1
                                            CONST_COMPLEX_PAIR_0_0, hs, 1)

#endif
1053
              if (wantDebug) call obj%timer%stop("blas")