elpa2_tridiag_band_template.F90 45.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53

54
#if REALCASE == 1
55
    subroutine tridiag_band_real_&
56 57
#endif
#if COMPLEXCASE == 1
58
    subroutine tridiag_band_complex_&
59
#endif
60
    &PRECISION &
61
    (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
62
    hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
63 64 65 66 67 68 69 70 71 72
    !-------------------------------------------------------------------------------
    ! tridiag_band_real/complex:
    ! Reduces a real symmetric band matrix to tridiagonal form
    !
    !  na          Order of matrix a
    !
    !  nb          Semi bandwidth
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
Andreas Marek's avatar
Andreas Marek committed
73
    !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
74 75 76 77
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
78
    ! hh_trans : householder vectors
79 80 81 82 83 84 85 86
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
87
    !  communicator
88 89
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
90
      use elpa_abstract_impl
91 92
      use elpa2_workload
      use precision
Andreas Marek's avatar
Andreas Marek committed
93
      use iso_c_binding
94
      use redist
95
      implicit none
96
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
97
      class(elpa_abstract_impl_t), intent(inout)   :: obj
98
      logical, intent(in)                          :: useGPU, wantDebug
99
      integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
100
#ifdef USE_ASSUMED_SIZE
101
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,*)
102
#else
103
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,matrixCols)
104
#endif
Andreas Marek's avatar
Andreas Marek committed
105
      integer(kind=c_intptr_t)                     :: a_dev
106
      real(kind=rk), intent(out)        :: d(na), e(na) ! set only on PE 0
107
      MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)
108 109 110 111

      real(kind=rk)                     :: vnorm2
      MATH_DATATYPE(kind=rck)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      MATH_DATATYPE(kind=rck)                     :: hd(nb), hs(nb)
112 113 114 115 116 117 118 119 120 121 122 123 124 125

      integer(kind=ik)                             :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
      integer(kind=ik)                             :: my_pe, n_pes, mpierr
      integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
      integer(kind=ik)                             :: ireq_ab, ireq_hv
      integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
!      integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
#endif
!      integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: omp_block_limits(:)
126
      MATH_DATATYPE(kind=rck), allocatable        :: hv_t(:,:), tau_t(:)
127 128 129 130 131
      integer(kind=ik)                             :: omp_get_max_threads
#endif /* WITH_OPENMP */
      integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
      integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
      integer(kind=ik), allocatable                :: block_limits(:)
132
      MATH_DATATYPE(kind=rck), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
133 134 135 136 137 138
      integer                                      :: istat
      character(200)                               :: errorMessage

#ifndef WITH_MPI
      integer(kind=ik)                             :: startAddr
#endif
139
      call obj%timer%start("tridiag_band_&
140
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
141
      &" // &
142 143
      &PRECISION_SUFFIX &
      )
144
      if (wantDebug) call obj%timer%start("mpi_communication")
145 146
      call mpi_comm_rank(communicator,my_pe,mpierr)
      call mpi_comm_size(communicator,n_pes,mpierr)
147 148 149 150 151

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
152
      if (wantDebug) call obj%timer%stop("mpi_communication")
153 154 155 156 157 158

      ! Get global_id mapping 2D processor coordinates to global id

      allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
159 160
  &MATH_DATATYPE&
  &: error when allocating global_id "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
161
        stop 1
162 163 164 165 166 167 168 169 170
      endif

      global_id(:,:) = 0
      global_id(my_prow, my_pcol) = my_pe

#ifdef WITH_OPENMP
      allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
171 172
  &MATH_DATATYPE&
  &: error when allocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
173
        stop 1
174 175 176 177
      endif
#endif

#ifdef WITH_MPI
178
      if (wantDebug) call obj%timer%start("mpi_communication")
179
#ifndef WITH_OPENMP
180
      call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
181 182
#else
      global_id_tmp(:,:) = global_id(:,:)
183
      call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
184 185 186
      deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
187 188
  &MATH_DATATYPE&
  &: error when deallocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
189
        stop 1
190 191
      endif
#endif /* WITH_OPENMP */
192
      if (wantDebug) call obj%timer%stop("mpi_communication")
193 194 195 196 197 198 199 200 201 202 203
#endif /* WITH_MPI */

      ! Total number of blocks in the band:

      nblocks_total = (na-1)/nb + 1

      ! Set work distribution

      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
204 205
  &MATH_DATATYPE&
  &: error when allocating block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
206
        stop 1
207 208
      endif

209
      call divide_band(obj,nblocks_total, n_pes, block_limits)
210 211 212 213 214 215 216 217 218

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)

      ! allocate the part of the band matrix which is needed by this PE
      ! The size is 1 block larger than needed to avoid extensive shifts
      allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
219 220
  &MATH_DATATYPE&
  &: error when allocating ab"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
221
        stop 1
222 223 224 225 226 227 228 229
      endif

      ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb

      ! Redistribute band in a to ab
Andreas Marek's avatar
Andreas Marek committed
230 231 232 233
      call redist_band_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
234
      &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
Andreas Marek's avatar
Andreas Marek committed
235

236 237 238 239 240 241
      ! Calculate the workload for each sweep in the back transformation
      ! and the space requirements to hold the HH vectors

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
242 243
  &MATH_DATATYPE&
  &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
244
        stop 1
245 246
      endif

247
      call determine_workload(obj,na, nb, np_rows, limits)
248 249 250 251 252 253
      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      do n = 1, nblocks_total
254
        call determine_workload(obj, nx, nb, np_rows, limits)
255 256
        local_size = limits(my_prow+1) - limits(my_prow)
        ! add to number of householder vectors
257
        ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
258 259 260 261 262 263 264 265 266
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_hh_vecs = num_hh_vecs + local_size
          num_chunks  = num_chunks+1
        endif
        nx = nx - nb
      enddo

      ! Allocate space for HH vectors

267
      allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
268
      if (istat .ne. 0) then
269 270
#if REALCASE == 1
        print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
271 272
#endif
#if COMPLEXCASE == 1
273 274
        print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
#endif
Andreas Marek's avatar
Andreas Marek committed
275
        stop 1
276 277 278 279 280 281 282
      endif

      ! Allocate and init MPI requests

      allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
283 284
  &MATH_DATATYPE&
  &: error when allocating ireq_hhr"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
285
        stop 1
286 287 288 289
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
290 291
  &MATH_DATATYPE&
  &: error when allocating ireq_hhs"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
292
        stop 1
293 294 295 296 297 298 299
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
300
        call determine_workload(obj,nx, nb, np_rows, limits)
301 302 303 304
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks  = num_chunks+1
#ifdef WITH_MPI
305
          if (wantDebug) call obj%timer%start("mpi_communication")
306
          call mpi_irecv(hh_trans(1,num_hh_vecs+1),          &
Andreas Marek's avatar
Retab  
Andreas Marek committed
307
                   nb*local_size,                           &
308
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
309
       MPI_REAL_PRECISION,                     &
310 311 312 313
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION,           &
#endif
314
                        nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
315
          if (wantDebug) call obj%timer%stop("mpi_communication")
316 317 318

#else /* WITH_MPI */
          ! careful: non-blocking recv — the data copy must be done at the matching wait or send
319 320
          ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)

321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
337 338
  &MATH_DATATYPE&
  &: error when allocating hh_gath"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
339
        stop 1
340 341 342 343 344
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
345 346
  &MATH_DATATYPE&
  &: error when allocating hh_send"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
347
        stop 1
348 349
      endif

350 351 352
      hh_gath(:,:,:) = 0.0_rck
      hh_send(:,:,:) = 0.0_rck

353 354 355 356 357
      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
358 359
  &MATH_DATATYPE&
  &: error when allocating hh_cnt"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
360
        stop 1
361 362 363 364 365
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
366 367
  &MATH_DATATYPE&
  &: error when allocating hh_dst"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
368
        stop 1
369 370
      endif

371
      hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
372 373 374 375 376 377 378 379 380 381
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
382 383
  &MATH_DATATYPE&
  &: error when allocating snd_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
384
        stop 1
385 386
      endif
      do iblk=1,nblocks
387
        call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:

      max_threads = 1
#if REALCASE == 1
      max_threads = omp_get_max_threads()
#endif
#if COMPLEXCASE == 1
!$      max_threads = omp_get_max_threads()
#endif
      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
407 408
  &MATH_DATATYPE&
  &: error when allocating omp_block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
409
        stop 1
410 411 412
      endif

      ! Get the OpenMP block limits
413
      call divide_band(obj,nblocks, max_threads, omp_block_limits)
414 415 416 417

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
418 419
  &MATH_DATATYPE&
  &: error when allocating hv_t, tau_t"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
420
        stop 1
421 422
      endif

423 424
      hv_t = 0.0_rck
      tau_t = 0.0_rck
425 426 427 428 429 430 431 432 433 434 435 436
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
437
        if (wantDebug) call obj%timer%start("mpi_communication")
438 439
        call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
440
                 MPI_REAL_PRECISION,  &
441 442 443 444
#endif
#if COMPLEXCASE == 1
                       MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
445
           my_pe-1, 1, communicator, ireq_ab, mpierr)
446
        if (wantDebug) call obj%timer%stop("mpi_communication")
447 448 449 450
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
451
          startAddr = ubound(hh_trans,dim=2)
452
#endif /* WITH_MPI */
453 454 455 456 457 458 459 460 461

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif

        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
462 463
          hv(:) = 0.0_rck
          tau = 0.0_rck
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486

          ! Transform first column of remaining matrix
#if REALCASE == 1
          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
         ! Opposed to the real case, the last step (istep=na-1) is needed here for making
         ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
          if (istep < na-1) then
            ! Transform first column of remaining matrix
            vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only
487
#endif /* COMPLEXCASE */
488

489 490 491 492
#if REALCASE == 1
            call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
493
            call hh_transform_complex_&
494 495
#endif
&PRECISION &
496
                               (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
497

498
            hv(1) = 1.0_rck
499
            hv(2:n) = ab(3:n+1,na_s-n_off)*hf
500
#if REALCASE == 1
501 502
          endif
#endif
503 504 505 506
          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
          if (istep == na-1) then
            d(na) = ab(1,na_s+1-n_off)
507
            e(na) = 0.0_rck
508 509 510
          endif
        else
          if (na>na_s) then
511
            ! Receive Householder Vector from previous task, from PE owning subdiagonal
512 513 514 515

#ifdef WITH_OPENMP

#ifdef WITH_MPI
516
            if (wantDebug) call obj%timer%start("mpi_communication")
517 518 519 520 521 522 523
            call mpi_recv(hv, nb,       &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
524
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
525
            if (wantDebug) call obj%timer%stop("mpi_communication")
526 527 528 529 530 531 532 533 534 535

#else /* WITH_MPI */

            hv(1:nb) = hv_s(1:nb)

#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
536
            if (wantDebug) call obj%timer%start("mpi_communication")
537 538 539 540 541 542 543 544

            call mpi_recv(hv, nb,          &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
545
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
546
            if (wantDebug) call obj%timer%stop("mpi_communication")
547 548 549 550 551 552 553

#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
            tau = hv(1)
554
            hv(1) = 1.0_rck
555 556 557 558 559 560
          endif
        endif

        na_s = na_s+1
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
561
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck
562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588
          n_off = n_off + nb
        endif

#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
589
            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

613
                ! Store Householder Vector for back transformation
614 615 616 617 618 619 620 621 622 623 624

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
625
                if (wantDebug) call obj%timer%start("blas")
626
#if REALCASE == 1
627
                call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
628 629
#endif
#if COMPLEXCASE == 1
630
                call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
631
#endif
632
                if (wantDebug) call obj%timer%stop("blas")
633 634 635 636 637 638
#if REALCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*tau
#endif
#if COMPLEXCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
#endif
Pavel Kus's avatar
Pavel Kus committed
639
                hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
640
                if (wantDebug) call obj%timer%start("blas")
641
#if REALCASE == 1
642
                call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
643 644
#endif
#if COMPLEXCASE == 1
645
                call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
646
#endif
647
                if (wantDebug) call obj%timer%stop("blas")
648 649
                hv_t(:,my_thread) = 0.0_rck
                tau_t(my_thread)  = 0.0_rck
650 651 652
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
653
                if (wantDebug) call obj%timer%start("blas")
654
                call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
655
                if (wantDebug) call obj%timer%stop("blas")
656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673
                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
                  vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
674
#endif /* COMPLEXCASE */
675

676 677 678 679 680 681
#if REALCASE == 1
                  call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
                  call hh_transform_complex_&
#endif
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
682
                  &PRECISION &
683
                        (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
684

685
                  hv_t(1   ,my_thread) = 1.0_rck
686
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
687
                  ab(nb+2:,ns) = 0.0_rck
688 689
                  ! update subdiagonal block for old and new Householder transformation
                  ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
690
                  if (wantDebug) call obj%timer%start("blas")
691 692
                  call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,            &
                          nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, ZERO, h(2), 1)
693
                  if (wantDebug) call obj%timer%stop("blas")
694 695 696 697 698 699 700

                  x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                  h(2:nb) = h(2:nb) - x*hv(2:nb)
                  ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                  do i=2,nb
                    ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
701
                                      h(i) - hs(1:nr)*hv(i)
702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
#endif
#if COMPLEXCASE == 1
                                                  conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
                  enddo

                else

                  ! No new Householder transformation for nr=1, just complete the old one
                  ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
                  do i=2,nb
#if REALCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
                  enddo
                  ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
721
                  hv_t(1,my_thread) = 1.0_rck
722 723 724 725 726 727 728
                endif

              enddo

            enddo ! my_thread
!$omp end parallel do

729
            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
730 731 732 733 734 735 736

            if (iter==1) then
              ! We are at the end of the first block

              ! Send our first column to previous PE
              if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif
                ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_isend(ab_s, nb+1,  &
#if REALCASE == 1
                   MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
             my_pe-1, 1, communicator, ireq_ab, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
              endif

              ! Request last column from next PE
              ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")

              if (istep>=max_threads .and. ne <= na) then
                call mpi_recv(ab(1,ne-n_off), nb+1,       &
#if REALCASE == 1
                  MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                              MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
                              my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
              endif
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
              if (istep>=max_threads .and. ne <= na) then
                ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
              endif
#endif /* WITH_MPI */
            else
              ! We are at the end of all blocks

              ! Send last HH Vector and TAU to next PE if it has been calculated above
              ne = na_s + nblocks*nb - (max_threads-1) - 1
              if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
                hv_s(1) = tau_t(max_threads)
                hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
                if (wantDebug) call obj%timer%start("mpi_communication")
                call mpi_isend(hv_s, nb,     &
#if REALCASE == 1
                   MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                               MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                               my_pe+1, 2, communicator, ireq_hv, mpierr)
                if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
              endif

              ! "Send" HH Vector and TAU to next OpenMP thread
              do my_thread = max_threads, 2, -1
                hv_t(:,my_thread) = hv_t(:,my_thread-1)
                tau_t(my_thread)  = tau_t(my_thread-1)
              enddo

            endif
          enddo ! iter

        else

          ! Codepath for 1 thread without OpenMP

          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
          ! and sends the Householder Vector and the first column as early
          ! as possible.

#endif /* WITH_OPENMP */

          do iblk=1,nblocks
            ns = na_s + (iblk-1)*nb - n_off ! first column in block
            ne = ns+nb-1                    ! last column in block

            if (ns+n_off>na) exit

            ! Store Householder Vector for back transformation

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
            if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
              ! Wait for last transfer to finish
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")

              call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
              if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
              ! Copy vectors into send buffer
              hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
              ! Send to destination

#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")
              call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk),  &
#if REALCASE == 1
                       MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                             MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                             global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
                             10+iblk, communicator, ireq_hhs(iblk), mpierr)
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
             ! do the post-poned irecv here
             startAddr = startAddr - hh_cnt(iblk)
             hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */

            ! Reset counter and increase destination row
              hh_cnt(iblk) = 0
              hh_dst(iblk) = hh_dst(iblk)+1
            endif

            ! The following code is structured in a way to keep waiting times for
            ! other PEs at a minimum, especially if there is only one block.
            ! For this reason, it requests the last column as late as possible
            ! and sends the Householder Vector and the first column as early
            ! as possible.
#endif /* WITH_OPENMP */
            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

            ! Multiply diagonal block and subdiagonal block with Householder Vector

            if (iblk==nblocks .and. nc==nb) then

              ! We need the last column from the next PE.
              ! First do the matrix multiplications without last column ...

              ! Diagonal block, the contribution of the last element is added below!
              ab(1,ne) = 0.0_rck
              if (wantDebug) call obj%timer%start("blas")

#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd,1)
#endif
              ! Subdiagonal block
              if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
              if (wantDebug) call obj%timer%stop("blas")

              ! ... then request last column ...
#ifdef WITH_MPI
              if (wantDebug) call obj%timer%start("mpi_communication")
#ifdef WITH_OPENMP
              call mpi_recv(ab(1,ne), nb+1,    &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#else /* WITH_OPENMP */
              call mpi_recv(ab(1,ne), nb+1,     &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                            MPI_COMPLEX_EXPLICIT_PRECISION,  &
#endif
                      my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#endif /* WITH_OPENMP */
              if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */

              ab(1:nb+1,ne) = ab_s(1:nb+1)

#endif /* WITH_MPI */

              ! ... and complete the result
              hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
              hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau

            else

              ! Normal matrix multiply
              if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
#if COMPLEXCASE == 1
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
              if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
              if (wantDebug) call obj%timer%stop("blas")
            endif

            ! Calculate first column of subdiagonal block and calculate new
            ! Householder transformation for this column
            hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb
            tau_new = 0.0_rck