elpa2_tridiag_band_template.F90 44.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53

54 55 56
    subroutine tridiag_band_&
    &MATH_DATATYPE&
    &_&
57
    &PRECISION &
58
    (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
59
    hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads)
60 61 62 63 64 65 66 67 68 69
    !-------------------------------------------------------------------------------
    ! tridiag_band_real/complex:
    ! Reduces a real symmetric band matrix to tridiagonal form
    !
    !  na          Order of matrix a
    !
    !  nb          Semi bandwith
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
Andreas Marek's avatar
Andreas Marek committed
70
    !  aMatrix(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
71 72 73 74
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
75
    ! hh_trans : housholder vectors
76 77 78 79 80 81 82 83
    !
    !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
84
    !  communicator
85 86
    !              MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
87
      use elpa_abstract_impl
88 89
      use elpa2_workload
      use precision
Andreas Marek's avatar
Andreas Marek committed
90
      use iso_c_binding
91
      use redist
92 93 94
#ifdef WITH_OPENMP
      use omp_lib
#endif
95
      implicit none
96
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
97
      class(elpa_abstract_impl_t), intent(inout)   :: obj
98
      logical, intent(in)                          :: useGPU, wantDebug
99
      integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
100
#ifdef USE_ASSUMED_SIZE
101
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,*)
102
#else
103
      MATH_DATATYPE(kind=rck), intent(in)         :: aMatrix(lda,matrixCols)
104
#endif
Andreas Marek's avatar
Andreas Marek committed
105
      integer(kind=c_intptr_t)                     :: a_dev
106
      real(kind=rk), intent(out)        :: d(na), e(na) ! set only on PE 0
107
      MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)
108 109 110 111

      real(kind=rk)                     :: vnorm2
      MATH_DATATYPE(kind=rck)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
      MATH_DATATYPE(kind=rck)                     :: hd(nb), hs(nb)
112

Andreas Marek's avatar
Andreas Marek committed
113
      integer(kind=ik)                             :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
114 115 116 117
      integer(kind=ik)                             :: my_pe, n_pes, mpierr
      integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
      integer(kind=ik)                             :: ireq_ab, ireq_hv
      integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
118
      integer(kind=ik), intent(in)                 :: nrThreads
119 120 121 122 123 124
#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
#endif
      integer(kind=ik), allocatable                :: global_id_tmp(:,:)
      integer(kind=ik), allocatable                :: omp_block_limits(:)
125
      MATH_DATATYPE(kind=rck), allocatable        :: hv_t(:,:), tau_t(:)
126 127 128 129
#endif /* WITH_OPENMP */
      integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
      integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
      integer(kind=ik), allocatable                :: block_limits(:)
130
      MATH_DATATYPE(kind=rck), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
131 132 133 134 135 136
      integer                                      :: istat
      character(200)                               :: errorMessage

#ifndef WITH_MPI
      integer(kind=ik)                             :: startAddr
#endif
137
      call obj%timer%start("tridiag_band_&
138
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
139
      &" // &
140 141
      &PRECISION_SUFFIX &
      )
142
      if (wantDebug) call obj%timer%start("mpi_communication")
143 144
      call mpi_comm_rank(communicator,my_pe,mpierr)
      call mpi_comm_size(communicator,n_pes,mpierr)
145 146 147 148 149

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
150
      if (wantDebug) call obj%timer%stop("mpi_communication")
151 152 153 154 155 156

      ! Get global_id mapping 2D procssor coordinates to global id

      allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
157 158
                 &MATH_DATATYPE&
                 &: error when allocating global_id "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
159
        stop 1
160 161 162 163 164 165 166 167 168
      endif

      global_id(:,:) = 0
      global_id(my_prow, my_pcol) = my_pe

#ifdef WITH_OPENMP
      allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
169 170
                &MATH_DATATYPE&
                &: error when allocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
171
        stop 1
172 173 174 175
      endif
#endif

#ifdef WITH_MPI
176
      if (wantDebug) call obj%timer%start("mpi_communication")
177
#ifndef WITH_OPENMP
178
      call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
179 180
#else
      global_id_tmp(:,:) = global_id(:,:)
181
      call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
182 183 184
      deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
185 186
                 &MATH_DATATYPE&
                 &: error when deallocating global_id_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
187
        stop 1
188 189
      endif
#endif /* WITH_OPENMP */
190
      if (wantDebug) call obj%timer%stop("mpi_communication")
191 192 193 194 195 196 197 198 199 200 201
#endif /* WITH_MPI */

      ! Total number of blocks in the band:

      nblocks_total = (na-1)/nb + 1

      ! Set work distribution

      allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
202 203
                 &MATH_DATATYPE&
                 &: error when allocating block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
204
        stop 1
205 206
      endif

207
      call divide_band(obj,nblocks_total, n_pes, block_limits)
208 209 210 211 212 213 214 215 216

      ! nblocks: the number of blocks for my task
      nblocks = block_limits(my_pe+1) - block_limits(my_pe)

      ! allocate the part of the band matrix which is needed by this PE
      ! The size is 1 block larger than needed to avoid extensive shifts
      allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
217 218
                 &MATH_DATATYPE&
                 &: error when allocating ab"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
219
        stop 1
220 221 222 223 224 225 226 227
      endif

      ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

      ! n_off: Offset of ab within band
      n_off = block_limits(my_pe)*nb

      ! Redistribute band in a to ab
Andreas Marek's avatar
Andreas Marek committed
228 229 230 231
      call redist_band_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
232
      &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
Andreas Marek's avatar
Andreas Marek committed
233

234 235 236 237 238 239
      ! Calculate the workload for each sweep in the back transformation
      ! and the space requirements to hold the HH vectors

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
240 241
                 &MATH_DATATYPE&
                 &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
242
        stop 1
243 244
      endif

245
      call determine_workload(obj,na, nb, np_rows, limits)
246 247 248 249 250 251
      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      do n = 1, nblocks_total
252
        call determine_workload(obj, nx, nb, np_rows, limits)
253 254
        local_size = limits(my_prow+1) - limits(my_prow)
        ! add to number of householder vectors
255
        ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor send below!
256 257 258 259 260 261 262 263 264
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_hh_vecs = num_hh_vecs + local_size
          num_chunks  = num_chunks+1
        endif
        nx = nx - nb
      enddo

      ! Allocate space for HH vectors

265
      allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
266
      if (istat .ne. 0) then
267 268
#if REALCASE == 1
        print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
269 270
#endif
#if COMPLEXCASE == 1
271 272
        print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
#endif
Andreas Marek's avatar
Andreas Marek committed
273
        stop 1
274 275 276 277 278 279 280
      endif

      ! Allocate and init MPI requests

      allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
281 282
                 &MATH_DATATYPE&
                 &: error when allocating ireq_hhr"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
283
        stop 1
284 285 286 287
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
288 289
                 &MATH_DATATYEP&
                 &: error when allocating ireq_hhs"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
290
        stop 1
291 292 293 294 295 296 297
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
298
        call determine_workload(obj,nx, nb, np_rows, limits)
299 300 301 302
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks  = num_chunks+1
#ifdef WITH_MPI
303
          if (wantDebug) call obj%timer%start("mpi_communication")
304
          call mpi_irecv(hh_trans(1,num_hh_vecs+1), nb*local_size,  MPI_MATH_DATATYPE_PRECISION_EXPL,     &
305
                        nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
306
          if (wantDebug) call obj%timer%stop("mpi_communication")
307 308 309

#else /* WITH_MPI */
          ! carefull non-block recv data copy must be done at wait or send
310 311
          ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)

312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
328 329
                 &MATH_DATATYPE&
                 &: error when allocating hh_gath"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
330
        stop 1
331 332 333 334 335
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
336 337
                &MATH_DATATYPE&
                &: error when allocating hh_send"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
338
        stop 1
339 340
      endif

341 342 343
      hh_gath(:,:,:) = 0.0_rck
      hh_send(:,:,:) = 0.0_rck

344 345 346 347 348
      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
349 350
                &MATH_DATATYPE&
                &: error when allocating hh_cnt"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
351
        stop 1
352 353 354 355 356
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
357 358
                &MATH_DATATYPE&
                &: error when allocating hh_dst"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
359
        stop 1
360 361
      endif

362
      hh_cnt(:) = 1 ! The first transfomation Vector is always 0 and not calculated at all
363 364 365 366 367 368 369 370 371 372
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
373 374
                &MATH_DATATYPE&
                &: error when allocating snd_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
375
        stop 1
376 377
      endif
      do iblk=1,nblocks
378
        call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
379 380 381 382
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:
383
      max_threads = nrThreads
384 385 386 387 388 389 390
      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
391 392
                 &MATH_DATATYPE&
                 &: error when allocating omp_block_limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
393
        stop 1
394 395 396
      endif

      ! Get the OpenMP block limits
397
      call divide_band(obj,nblocks, max_threads, omp_block_limits)
398 399 400 401

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_&
Andreas Marek's avatar
Andreas Marek committed
402 403
                &MATH_DATATYPE&
                &: error when allocating hv_t, tau_t"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
404
        stop 1
405 406
      endif

407 408
      hv_t = 0.0_rck
      tau_t = 0.0_rck
409 410 411 412 413 414 415 416 417 418 419 420
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
421
        if (wantDebug) call obj%timer%start("mpi_communication")
422
        call mpi_isend(ab_s, nb+1, MPI_MATH_DATATYPE_PRECISION_EXPL, &
Andreas Marek's avatar
Retab  
Andreas Marek committed
423
           my_pe-1, 1, communicator, ireq_ab, mpierr)
424
        if (wantDebug) call obj%timer%stop("mpi_communication")
425 426 427 428
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
429
          startAddr = ubound(hh_trans,dim=2)
430
#endif /* WITH_MPI */
431 432 433 434 435 436 437 438 439

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif

        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
440 441
          hv(:) = 0.0_rck
          tau = 0.0_rck
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464

          ! Transform first column of remaining matrix
#if REALCASE == 1
          ! The last step (istep=na-1) is only needed for sending the last HH vectors.
          ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
         ! Opposed to the real case, the last step (istep=na-1) is needed here for making
         ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
          if (istep < na-1) then
            ! Transform first column of remaining matrix
            vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only
465
#endif /* COMPLEXCASE */
466

467 468 469 470
            call hh_transform_&
                &MATH_DATATYPE&
                &_&
                &PRECISION &
471
                               (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
472

473
            hv(1) = 1.0_rck
474
            hv(2:n) = ab(3:n+1,na_s-n_off)*hf
475
#if REALCASE == 1
476 477
          endif
#endif
478 479

#if REALCASE == 1
480 481
          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
482 483 484 485 486 487
#endif
#if COMPLEXCASE == 1
          d(istep) = real(ab(1,na_s-n_off), kind=rk)
          e(istep) = real(ab(2,na_s-n_off), kind=rk)
#endif

488
          if (istep == na-1) then
489
#if REALCASE == 1
490
            d(na) = ab(1,na_s+1-n_off)
491 492 493 494
#endif
#if COMPLEXCASE == 1
            d(na) = real(ab(1,na_s+1-n_off),kind=rk)
#endif
495
            e(na) = 0.0_rck
496 497 498
          endif
        else
          if (na>na_s) then
499
            ! Receive Householder Vector from previous task, from PE owning subdiagonal
500 501 502 503

#ifdef WITH_OPENMP

#ifdef WITH_MPI
504
            if (wantDebug) call obj%timer%start("mpi_communication")
505
            call mpi_recv(hv, nb, MPI_MATH_DATATYPE_PRECISION_EXPL, &
506
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
507
            if (wantDebug) call obj%timer%stop("mpi_communication")
508 509 510 511 512 513 514 515 516 517

#else /* WITH_MPI */

            hv(1:nb) = hv_s(1:nb)

#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
518
            if (wantDebug) call obj%timer%start("mpi_communication")
519

520
            call mpi_recv(hv, nb, MPI_MATH_DATATYPE_PRECISION_EXPL, &
521
                          my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
522
            if (wantDebug) call obj%timer%stop("mpi_communication")
523 524 525 526 527 528 529

#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
            tau = hv(1)
530
            hv(1) = 1.0_rck
531 532 533 534 535 536
          endif
        endif

        na_s = na_s+1
        if (na_s-n_off > nb) then
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
537
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck
538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
          n_off = n_off + nb
        endif

#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
565
            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

589
                ! Store Householder Vector for back transformation
590 591 592 593 594 595 596 597 598 599 600

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
601
                if (wantDebug) call obj%timer%start("blas")
602
#if REALCASE == 1
603
                call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
604 605
#endif
#if COMPLEXCASE == 1
606
                call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
607
#endif
608
                if (wantDebug) call obj%timer%stop("blas")
609 610 611 612 613 614
#if REALCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*tau
#endif
#if COMPLEXCASE == 1
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
#endif
Pavel Kus's avatar
Pavel Kus committed
615
                hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
616
                if (wantDebug) call obj%timer%start("blas")
617
#if REALCASE == 1
618
                call PRECISION_SYR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
619 620
#endif
#if COMPLEXCASE == 1
621
                call PRECISION_HER2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
622
#endif
623
                if (wantDebug) call obj%timer%stop("blas")
624 625
                hv_t(:,my_thread) = 0.0_rck
                tau_t(my_thread)  = 0.0_rck
626 627 628
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
629
                if (wantDebug) call obj%timer%start("blas")
630
                call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
631
                if (wantDebug) call obj%timer%stop("blas")
632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
                  vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
650
#endif /* COMPLEXCASE */
651

652 653 654
                  call hh_transform_&
                  &MATH_DATATYPE&
                  &_&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
655
                  &PRECISION &
656
                        (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)
657

658
                  hv_t(1   ,my_thread) = 1.0_rck
659
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
660
                  ab(nb+2:,ns) = 0.0_rck
661 662
                  ! update subdiagonal block for old and new Householder transformation
                  ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
663
                  if (wantDebug) call obj%timer%start("blas")
664 665
                  call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,            &
                          nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, ZERO, h(2), 1)
666
                  if (wantDebug) call obj%timer%stop("blas")
667 668 669 670 671 672 673

                  x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                  h(2:nb) = h(2:nb) - x*hv(2:nb)
                  ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                  do i=2,nb
                    ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
674
                                      h(i) - hs(1:nr)*hv(i)
675 676
#endif
#if COMPLEXCASE == 1
677
                                      conjg(h(i)) - hs(1:nr)*conjg(hv(i))
678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693
#endif
                  enddo

                else

                  ! No new Householder transformation for nr=1, just complete the old one
                  ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
                  do i=2,nb
#if REALCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                    ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
                  enddo
                  ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
694
                  hv_t(1,my_thread) = 1.0_rck
695 696 697 698 699 700 701
                endif

              enddo

            enddo ! my_thread
!$omp end parallel do

702
            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
703 704 705 706 707 708 709

            if (iter==1) then
              ! We are at the end of the first block

              ! Send our first column to previous PE
              if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
710
                if (wantDebug) call obj%timer%start("mpi_communication")
711
                call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
712
                if (wantDebug) call obj%timer%stop("mpi_communication")
713 714 715 716

#endif
                ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
717
                if (wantDebug) call obj%timer%start("mpi_communication")
718
                call mpi_isend(ab_s, nb+1, MPI_MATH_DATATYPE_PRECISION_EXPL, &
Andreas Marek's avatar
Retab  
Andreas Marek committed
719
             my_pe-1, 1, communicator, ireq_ab, mpierr)
720
                if (wantDebug) call obj%timer%stop("mpi_communication")
721 722 723 724 725 726 727

#endif /* WITH_MPI */
              endif

              ! Request last column from next PE
              ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
728
              if (wantDebug) call obj%timer%start("mpi_communication")
729 730

              if (istep>=max_threads .and. ne <= na) then
731
                call mpi_recv(ab(1,ne-n_off), nb+1, MPI_MATH_DATATYPE_PRECISION_EXPL,  &
732
                              my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
733
              endif
734
              if (wantDebug) call obj%timer%stop("mpi_communication")
735 736 737 738 739 740 741 742
#else /* WITH_MPI */
              if (istep>=max_threads .and. ne <= na) then
                ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
              endif
#endif /* WITH_MPI */
            else
              ! We are at the end of all blocks

743
              ! Send last HH Vector and TAU to next PE if it has been calculated above
744 745 746
              ne = na_s + nblocks*nb - (max_threads-1) - 1
              if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
747
                if (wantDebug) call obj%timer%start("mpi_communication")
748
                call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
749
                if (wantDebug) call obj%timer%stop("mpi_communication")
750 751 752 753 754
#endif
                hv_s(1) = tau_t(max_threads)
                hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
755
                if (wantDebug) call obj%timer%start("mpi_communication")
756
                call mpi_isend(hv_s, nb, MPI_MATH_DATATYPE_PRECISION_EXPL, &
757
                               my_pe+1, 2, communicator, ireq_hv, mpierr)
758
                if (wantDebug) call obj%timer%stop("mpi_communication")
759 760 761 762

#endif /* WITH_MPI */
              endif

763
              ! "Send" HH Vector and TAU to next OpenMP thread
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778
              do my_thread = max_threads, 2, -1
                hv_t(:,my_thread) = hv_t(:,my_thread-1)
                tau_t(my_thread)  = tau_t(my_thread-1)
              enddo

            endif
          enddo ! iter

        else

          ! Codepath for 1 thread without OpenMP

          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
779
          ! and sends the Householder Vector and the first column as early
780 781 782 783 784 785 786 787 788 789
          ! as possible.

#endif /* WITH_OPENMP */

          do iblk=1,nblocks
            ns = na_s + (iblk-1)*nb - n_off ! first column in block
            ne = ns+nb-1                    ! last column in block

            if (ns+n_off>na) exit

790
            ! Store Householder Vector for back transformation
791 792 793 794 795 796 797 798 799 800

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
            if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
              ! Wait for last transfer to finish
#ifdef WITH_MPI
801
              if (wantDebug) call obj%timer%start("mpi_communication")
802 803

              call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
804
              if (wantDebug) call obj%timer%stop("mpi_communication")
805 806 807 808 809 810
#endif
              ! Copy vectors into send buffer
              hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
              ! Send to destination

#ifdef WITH_MPI
811
              if (wantDebug) call obj%timer%start("mpi_communication")
812
              call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_MATH_DATATYPE_PRECISION_EXPL, &
813
                             global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
814
                             10+iblk, communicator, ireq_hhs(iblk), mpierr)
815
              if (wantDebug) call obj%timer%stop("mpi_communication")
816 817 818
#else /* WITH_MPI */
             ! do the post-poned irecv here
             startAddr = startAddr - hh_cnt(iblk)
819
             hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
820 821 822 823 824 825 826 827 828 829
#endif /* WITH_MPI */

            ! Reset counter and increase destination row
              hh_cnt(iblk) = 0
              hh_dst(iblk) = hh_dst(iblk)+1
            endif

            ! The following code is structured in a way to keep waiting times for
            ! other PEs at a minimum, especially if there is only one block.
            ! For this reason, it requests the last column as late as possible
830
            ! and sends the Householder Vector and the first column as early
831 832 833 834 835 836
            ! as possible.
#endif /* WITH_OPENMP */
            nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

837
            ! Multiply diagonal block and subdiagonal block with Householder Vector
838 839 840 841 842 843 844

            if (iblk==nblocks .and. nc==nb) then

              ! We need the last column from the next PE.
              ! First do the matrix multiplications without last column ...

              ! Diagonal block, the contribution of the last element is added below!
845
              ab(1,ne) = 0.0_rck
846
              if (wantDebug) call obj%timer%start("blas")
847 848

#if REALCASE == 1
849
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
850 851
#endif
#if COMPLEXCASE == 1
852
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd,1)
853 854
#endif
              ! Subdiagonal block
855
              if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
856
              if (wantDebug) call obj%timer%stop("blas")
857 858 859

              ! ... then request last column ...
#ifdef WITH_MPI
860
              if (wantDebug) call obj%timer%start("mpi_communication")
861
#ifdef WITH_OPENMP
862
              call mpi_recv(ab(1,ne), nb+1, MPI_MATH_DATATYPE_PRECISION_EXPL,  &
Andreas Marek's avatar
Retab  
Andreas Marek committed
863
          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
864
#else /* WITH_OPENMP */
865
              call mpi_recv(ab(1,ne), nb+1, MPI_MATH_DATATYPE_PRECISION_EXPL,  &
Andreas Marek's avatar
Retab  
Andreas Marek committed
866
                      my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
867
#endif /* WITH_OPENMP */
868
              if (wantDebug) call obj%timer%stop("mpi_communication")
869 870 871 872 873 874 875 876 877 878 879 880 881
#else /* WITH_MPI */

              ab(1:nb+1,ne) = ab_s(1:nb+1)

#endif /* WITH_MPI */

              ! ... and complete the result
              hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
              hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau

            else

              ! Normal matrix multiply
882
              if (wantDebug) call obj%timer%start("blas")
883
#if REALCASE == 1
884
              call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
885 886
#endif
#if COMPLEXCASE == 1
887
              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
888
#endif
889
              if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, ZERO, hs, 1)
890
              if (wantDebug) call obj%timer%stop("blas")
891 892 893 894
            endif

            ! Calculate first column of subdiagonal block and calculate new
            ! Householder transformation for this column
895 896
            hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb
            tau_new = 0.0_rck
897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913
            if (nr>0) then

              ! complete (old) Householder transformation for first column

              ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

              ! calculate new Householder transformation ...
              if (nr>1) then
#if  REALCASE == 1
                vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
                vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
914
#endif /* COMPLEXCASE */
915

916 917 918
                call hh_transform_&
                &MATH_DATATYPE&
                &_&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
919
                &PRECISION &
920
                             (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug)
921
                hv_new(1) = 1.0_rck
922
                hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf