elpa2_tridiag_band_template.F90 45.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53

Andreas Marek's avatar
Andreas Marek committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
subroutine tridiag_band_&
  &MATH_DATATYPE&
  &_&
  &PRECISION &
  (obj, na, nb, nblk, a_mat, lda, d, e, matrixCols, &
  hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads)
  !-------------------------------------------------------------------------------
  ! tridiag_band_real/complex:
  ! Reduces a real symmetric band matrix to tridiagonal form
  !
  !  na          Order of matrix a
  !
  !  nb          Semi bandwith
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  a_mat(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  ! hh_trans : housholder vectors
  !
  !  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
  !
  !  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !  communicator
  !              MPI-Communicator for the total processor set
  !-------------------------------------------------------------------------------
  use elpa_abstract_impl
  use elpa2_workload
  use precision
90
  use, intrinsic :: iso_c_binding
Andreas Marek's avatar
Andreas Marek committed
91
  use redist
92
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
93
  use omp_lib
94
#endif
Andreas Marek's avatar
Andreas Marek committed
95
96
97
  use elpa_blas_interfaces
  use elpa_skewsymmetric_blas
  implicit none
98
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
99
100
101
102
103
  class(elpa_abstract_impl_t), intent(inout)   :: obj
  logical, intent(in)                          :: useGPU, wantDebug
  integer(kind=c_int)                          :: skewsymmetric
  logical                                      :: isSkewsymmetric
  integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
104
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
105
  MATH_DATATYPE(kind=rck), intent(in)         :: a_mat(lda,*)
106
#else
Andreas Marek's avatar
Andreas Marek committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  MATH_DATATYPE(kind=rck), intent(in)         :: a_mat(lda,matrixCols)
#endif
  real(kind=rk), intent(out)        :: d(na), e(na) ! set only on PE 0
  MATH_DATATYPE(kind=rck), intent(out), allocatable   :: hh_trans(:,:)

  real(kind=rk)                     :: vnorm2
  MATH_DATATYPE(kind=rck)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
  MATH_DATATYPE(kind=rck)                     :: hd(nb), hs(nb)

  integer(kind=ik)                             :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
  integer(kind=ik)                             :: my_pe, n_pes
  integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
  integer(kind=MPI_KIND)                       :: my_peMPI, n_pesMPI, mpierr
  integer(kind=MPI_KIND)                       :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
  integer(kind=MPI_KIND)                       :: ireq_ab, ireq_hv
  integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
  integer(kind=ik), intent(in)                 :: nrThreads
124
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
125
  integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
126
127
#ifdef WITH_MPI
#endif
Andreas Marek's avatar
Andreas Marek committed
128
129
130
  integer(kind=ik), allocatable                :: global_id_tmp(:,:)
  integer(kind=ik), allocatable                :: omp_block_limits(:)
  MATH_DATATYPE(kind=rck), allocatable         :: hv_t(:,:), tau_t(:)
131
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
132
133
134
135
136
137
138
139
140
  integer(kind=ik), allocatable                :: global_id(:,:), hh_cnt(:), hh_dst(:)
  integer(kind=MPI_KIND), allocatable          :: ireq_hhr(:), ireq_hhs(:)
  integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
  integer(kind=ik), allocatable                :: block_limits(:)
  MATH_DATATYPE(kind=rck), allocatable         :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
  integer                                      :: istat
  integer(kind=ik)                             :: nblockEnd
  character(200)                               :: errorMessage
  character(20)                                :: gpuString
141
142

#ifndef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
  integer(kind=ik)                             :: startAddr
#endif

  call obj%get("is_skewsymmetric",skewsymmetric,istat)
  if (istat .ne. ELPA_OK) then
       print *,"Problem getting option for skewsymmetric settings. Aborting..."
       stop
  endif
  isSkewsymmetric = (skewsymmetric == 1)
  
  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("tridiag_band_&
  &MATH_DATATYPE&
  &" // &
  &PRECISION_SUFFIX //&
  gpuString)

  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(communicator,kind=MPI_KIND) ,my_peMPI ,mpierr)
  call mpi_comm_size(int(communicator,kind=MPI_KIND) ,n_pesMPI ,mpierr)

  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND),my_prowMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND),np_rowsMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND),my_pcolMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND),np_colsMPI ,mpierr)

  my_pe = int(my_peMPI,kind=MPI_KIND)
  n_pes = int(n_pesMPI,kind=MPI_KIND)
  my_prow = int(my_prowMPI,kind=MPI_KIND)
  np_rows = int(np_rowsMPI,kind=MPI_KIND)
  my_pcol = int(my_pcolMPI,kind=MPI_KIND)
  np_cols = int(np_colsMPI,kind=MPI_KIND)
  if (wantDebug) call obj%timer%stop("mpi_communication")

  ! Get global_id mapping 2D procssor coordinates to global id

  allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: global_id", istat, errorMessage)

  global_id(:,:) = 0
  global_id(my_prow, my_pcol) = my_pe
189

190
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
191
192
  allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: global_id_tmp", istat, errorMessage)
193
194
195
#endif

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
196
  if (wantDebug) call obj%timer%start("mpi_communication")
197
#ifndef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
198
  call mpi_allreduce(mpi_in_place, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, &
199
                         mpi_sum, int(communicator,kind=MPI_KIND), mpierr)
200
#else
Andreas Marek's avatar
Andreas Marek committed
201
202
203
204
205
  global_id_tmp(:,:) = global_id(:,:)
  call mpi_allreduce(global_id_tmp, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, &
                     mpi_sum, int(communicator,kind=MPI_KIND), mpierr)
  deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
  check_deallocate("tridiag_band: global_id_tmp", istat, errorMessage)
206
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
207
  if (wantDebug) call obj%timer%stop("mpi_communication")
208
209
#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
210
  ! Total number of blocks in the band:
211

Andreas Marek's avatar
Andreas Marek committed
212
  nblocks_total = (na-1)/nb + 1
213

Andreas Marek's avatar
Andreas Marek committed
214
  ! Set work distribution
215

Andreas Marek's avatar
Andreas Marek committed
216
217
  allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: block_limits", istat, errorMessage)
218

Andreas Marek's avatar
Andreas Marek committed
219
  call divide_band(obj,nblocks_total, n_pes, block_limits)
220

Andreas Marek's avatar
Andreas Marek committed
221
222
  ! nblocks: the number of blocks for my task
  nblocks = block_limits(my_pe+1) - block_limits(my_pe)
223

Andreas Marek's avatar
Andreas Marek committed
224
225
226
227
  ! allocate the part of the band matrix which is needed by this PE
  ! The size is 1 block larger than needed to avoid extensive shifts
  allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: ab", istat, errorMessage)
228

Andreas Marek's avatar
Andreas Marek committed
229
  ab = 0.0_rck ! needed for lower half, the extra block should also be set to 0 for safety
230

Andreas Marek's avatar
Andreas Marek committed
231
232
  ! n_off: Offset of ab within band
  n_off = block_limits(my_pe)*nb
233

Andreas Marek's avatar
Andreas Marek committed
234
235
236
237
238
239
  ! Redistribute band in a to ab
  call redist_band_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &(obj,a_mat, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
Andreas Marek's avatar
Andreas Marek committed
240

Andreas Marek's avatar
Andreas Marek committed
241
242
  ! Calculate the workload for each sweep in the back transformation
  ! and the space requirements to hold the HH vectors
243

Andreas Marek's avatar
Andreas Marek committed
244
245
  allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: limits", istat, errorMessage)
246

Andreas Marek's avatar
Andreas Marek committed
247
248
  call determine_workload(obj,na, nb, np_rows, limits)
  max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
249

Andreas Marek's avatar
Andreas Marek committed
250
251
252
253
254
255
256
257
258
259
260
261
262
263
  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  do n = 1, nblocks_total
    call determine_workload(obj, nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    ! add to number of householder vectors
    ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor send below!
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_hh_vecs = num_hh_vecs + local_size
      num_chunks  = num_chunks+1
    endif
    nx = nx - nb
  enddo
264

Andreas Marek's avatar
Andreas Marek committed
265
  ! Allocate space for HH vectors
266

Andreas Marek's avatar
Andreas Marek committed
267
268
  allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_trans", istat, errorMessage)
269

Andreas Marek's avatar
Andreas Marek committed
270
  ! Allocate and init MPI requests
271

Andreas Marek's avatar
Andreas Marek committed
272
273
274
275
  allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
  check_allocate("tridiag_band: ireq_hhr", istat, errorMessage)
  allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
  check_allocate("tridiag_band: ireq_hhs", istat, errorMessage)
276

Andreas Marek's avatar
Andreas Marek committed
277
278
279
280
281
282
283
284
285
286
  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  nt = 0
  nBlockEnd=1
  do n = 1, nblocks_total
    call determine_workload(obj,nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_chunks  = num_chunks+1
287
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
288
289
290
291
292
      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_irecv(hh_trans(1,num_hh_vecs+1), int(nb*local_size,kind=MPI_KIND),  MPI_MATH_DATATYPE_PRECISION_EXPL,     &
                    int(nt,kind=MPI_KIND), int(10+n-block_limits(nt),kind=MPI_KIND), &
                    int(communicator,kind=MPI_KIND), ireq_hhr(num_chunks), mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")
293
294

#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
295
296
      ! carefull non-block recv data copy must be done at wait or send
      ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
297

298
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
299
300
301
302
303
304
305
      num_hh_vecs = num_hh_vecs + local_size
    endif
    nx = nx - nb
    if (n == block_limits(nt+1)) then
      nt = nt + 1
    endif
  enddo
306
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
307
  ireq_hhs(:) = MPI_REQUEST_NULL
308
#endif
Andreas Marek's avatar
Andreas Marek committed
309
  ! Buffers for gathering/sending the HH vectors
310

Andreas Marek's avatar
Andreas Marek committed
311
312
  allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
  check_allocate("tridiag_band: hh_gath", istat, errorMessage)
313

Andreas Marek's avatar
Andreas Marek committed
314
315
  allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
  check_allocate("tridiag_band: hh_send", istat, errorMessage)
316

Andreas Marek's avatar
Andreas Marek committed
317
318
  hh_gath(:,:,:) = 0.0_rck
  hh_send(:,:,:) = 0.0_rck
319

Andreas Marek's avatar
Andreas Marek committed
320
  ! Some counters
321

Andreas Marek's avatar
Andreas Marek committed
322
323
  allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_cnt", istat, errorMessage)
324

Andreas Marek's avatar
Andreas Marek committed
325
326
  allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_dst", istat, errorMessage)
327

Andreas Marek's avatar
Andreas Marek committed
328
329
  hh_cnt(:) = 1 ! The first transfomation Vector is always 0 and not calculated at all
  hh_dst(:) = 0 ! PE number for receive
330
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
331
332
  ireq_ab = MPI_REQUEST_NULL
  ireq_hv = MPI_REQUEST_NULL
333
#endif
Andreas Marek's avatar
Andreas Marek committed
334
  ! Limits for sending
335

Andreas Marek's avatar
Andreas Marek committed
336
337
  allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: snd_limits", istat, errorMessage)
338

Andreas Marek's avatar
Andreas Marek committed
339
340
341
  do iblk=1,nblocks
    call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
  enddo
342

343
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
344
345
346
347
348
  ! OpenMP work distribution:
  max_threads = nrThreads
  ! For OpenMP we need at least 2 blocks for every thread
  max_threads = MIN(max_threads, nblocks/2)
  if (max_threads==0) max_threads = 1
349

Andreas Marek's avatar
Andreas Marek committed
350
351
  allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: omp_block_limits", istat, errorMessage)
352

Andreas Marek's avatar
Andreas Marek committed
353
354
  ! Get the OpenMP block limits
  call divide_band(obj,nblocks, max_threads, omp_block_limits)
355

Andreas Marek's avatar
Andreas Marek committed
356
357
  allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hv_t", istat, errorMessage)
358

Andreas Marek's avatar
Andreas Marek committed
359
360
  hv_t = 0.0_rck
  tau_t = 0.0_rck
361
#endif /* WITH_OPENMP_TRADITIONAL */
362

Andreas Marek's avatar
Andreas Marek committed
363
364
  ! ---------------------------------------------------------------------------
  ! Start of calculations
365

Andreas Marek's avatar
Andreas Marek committed
366
  na_s = block_limits(my_pe)*nb + 1
367

Andreas Marek's avatar
Andreas Marek committed
368
369
370
371
  if (my_pe>0 .and. na_s<=na) then
    ! send first column to previous PE
    ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
    ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
372
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
373
374
375
376
    if (wantDebug) call obj%timer%start("mpi_communication")
    call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                   int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr)
    if (wantDebug) call obj%timer%stop("mpi_communication")
377
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
378
  endif
379
380

#ifndef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
381
  startAddr = ubound(hh_trans,dim=2)
382
#endif /* WITH_MPI */
383

384
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
385
   do istep=1,na-nblockEnd-block_limits(my_pe)*nb
386
#else
Andreas Marek's avatar
Andreas Marek committed
387
   do istep=1,na-nblockEnd
388
389
#endif

Andreas Marek's avatar
Andreas Marek committed
390
391
392
393
394
     if (my_pe==0) then
       n = MIN(na-na_s,nb) ! number of rows to be reduced
       hv(:) = 0.0_rck
       hd(:) = 0.0_rck
       tau = 0.0_rck
395

Andreas Marek's avatar
Andreas Marek committed
396
       ! Transform first column of remaining matrix
397
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
398
399
       ! The last step (istep=na-1) is only needed for sending the last HH vectors.
       ! We don't want the sign of the last element flipped (analogous to the other sweeps)
400
401
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
402
403
       ! Opposed to the real case, the last step (istep=na-1) is needed here for making
       ! the last subdiagonal element a real number
404
405
406
#endif

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
407
408
409
       if (istep < na-1) then
         ! Transform first column of remaining matrix
         vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
410
411
412
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
413
         vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
414
#else
Andreas Marek's avatar
Andreas Marek committed
415
         vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
416
#endif
Andreas Marek's avatar
Andreas Marek committed
417
         if (n<2) vnorm2 = 0.0_rk ! Safety only
418
#endif /* COMPLEXCASE */
419

Andreas Marek's avatar
Andreas Marek committed
420
421
422
423
424
          call hh_transform_&
               &MATH_DATATYPE&
               &_&
               &PRECISION &
                          (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)
425

Andreas Marek's avatar
Andreas Marek committed
426
427
          hv(1) = 1.0_rck
          hv(2:n) = ab(3:n+1,na_s-n_off)*hf
428
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
429
       endif
430
#endif
431
432

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
433
434
435
436
437
438
       if (isSkewsymmetric) then
         d(istep) = 0.0_rk
       else
         d(istep) = ab(1,na_s-n_off)
       endif
       e(istep) = ab(2,na_s-n_off)
439
440
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
441
442
       d(istep) = real(ab(1,na_s-n_off), kind=rk)
       e(istep) = real(ab(2,na_s-n_off), kind=rk)
443
444
#endif

Andreas Marek's avatar
Andreas Marek committed
445
       if (istep == na-1) then
446
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
447
448
449
450
451
         if (isSkewsymmetric) then
           d(na) = 0
         else
           d(na) = ab(1,na_s+1-n_off)
         endif
452
#endif
453

454
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
455
         d(na) = real(ab(1,na_s+1-n_off),kind=rk)
456
#endif
Andreas Marek's avatar
Andreas Marek committed
457
458
459
460
461
         e(na) = 0.0_rck
       endif
     else
       if (na>na_s) then
         ! Receive Householder Vector from previous task, from PE owning subdiagonal
462

463
#ifdef WITH_OPENMP_TRADITIONAL
464
465

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
466
467
468
469
470
         if (wantDebug) call obj%timer%start("mpi_communication")
         call mpi_recv(hv, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                       int(my_pe-1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), &
                       MPI_STATUS_IGNORE, mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
471
472
473

#else /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
474
         hv(1:nb) = hv_s(1:nb)
475
476
477

#endif /* WITH_MPI */

478
#else /* WITH_OPENMP_TRADITIONAL */
479
480

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
481
         if (wantDebug) call obj%timer%start("mpi_communication")
482

Andreas Marek's avatar
Andreas Marek committed
483
484
485
486
         call mpi_recv(hv, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                       int(my_pe-1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), &
                       MPI_STATUS_IGNORE, mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
487
488

#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
489
         hv(1:nb) = hv_s(1:nb)
490
491
#endif /* WITH_MPI */

492
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
493
494
495
496
         tau = hv(1)
         hv(1) = 1.0_rck
       endif
      endif
497

Andreas Marek's avatar
Andreas Marek committed
498
499
500
501
502
503
      na_s = na_s+1
      if (na_s-n_off > nb) then
        ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
        ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck
        n_off = n_off + nb
      endif
504

505
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
506
      if (max_threads > 1) then
507

Andreas Marek's avatar
Andreas Marek committed
508
        ! Codepath for OpenMP
509

Andreas Marek's avatar
Andreas Marek committed
510
511
512
513
514
515
        ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
        ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
        ! This simulates the behaviour of the MPI tasks which also work after each other.
        ! The code would be considerably easier, if the MPI communication would be made within
        ! the parallel region - this is avoided here since this would require
        ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
516

Andreas Marek's avatar
Andreas Marek committed
517
518
        hv_t(:,1) = hv
        tau_t(1) = tau
519

Andreas Marek's avatar
Andreas Marek committed
520
        do iter = 1, 2
521

Andreas Marek's avatar
Andreas Marek committed
522
523
524
525
526
527
528
529
          ! iter=1 : work on first block
          ! iter=2 : work on remaining blocks
          ! This is done in 2 iterations so that we have a barrier in between:
          ! After the first iteration, it is guaranteed that the last row of the last block
          ! is completed by the next thread.
          ! After the first iteration it is also the place to exchange the last row
          ! with MPI calls
          call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
530

531
532
533
534
535
536
537
!$omp parallel do &
!$omp default(none) &
!$omp private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&        nc, nr, hs, hd, vnorm2, hf, x, h, i) &
!$omp shared(max_threads, obj, ab, isSkewsymmetric, wantDebug, hh_gath, &
!$omp        hh_cnt, tau_t, hv_t, na, istep, n_off, na_s, nb, omp_block_limits, iter) &
!$omp         schedule(static,1), num_threads(max_threads)
Andreas Marek's avatar
Andreas Marek committed
538
          do my_thread = 1, max_threads
539

Andreas Marek's avatar
Andreas Marek committed
540
541
542
543
544
545
546
            if (iter == 1) then
              my_block_s = omp_block_limits(my_thread-1) + 1
              my_block_e = my_block_s
            else
              my_block_s = omp_block_limits(my_thread-1) + 2
              my_block_e = omp_block_limits(my_thread)
            endif
547

Andreas Marek's avatar
Andreas Marek committed
548
            do iblk = my_block_s, my_block_e
549

Andreas Marek's avatar
Andreas Marek committed
550
551
              ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
              ne = ns+nb-1                    ! last column in block
552

Andreas Marek's avatar
Andreas Marek committed
553
              if (istep<my_thread .or. ns+n_off>na) exit
554

Andreas Marek's avatar
Andreas Marek committed
555
556
              hv = hv_t(:,my_thread)
              tau = tau_t(my_thread)
557

Andreas Marek's avatar
Andreas Marek committed
558
              ! Store Householder Vector for back transformation
559

Andreas Marek's avatar
Andreas Marek committed
560
              hh_cnt(iblk) = hh_cnt(iblk) + 1
561

Andreas Marek's avatar
Andreas Marek committed
562
563
              hh_gath(1   ,hh_cnt(iblk),iblk) = tau
              hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
564

Andreas Marek's avatar
Andreas Marek committed
565
566
567
              nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
              nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                        ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
568

Andreas Marek's avatar
Andreas Marek committed
569
570
              ! Transform diagonal block
              if (wantDebug) call obj%timer%start("blas")
571
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
572
573
574
575
576
              if (isSkewsymmetric) then
                hd(:) = 0.0_rk
                call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd)
              else
                call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
577
                                      hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
Andreas Marek's avatar
Andreas Marek committed
578
              endif
579
580
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
581
              call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
582
                                    hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
583
#endif
Andreas Marek's avatar
Andreas Marek committed
584
              if (wantDebug) call obj%timer%stop("blas")
585
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
586
587
588
              if (.NOT. isSkewsymmetric) then
                x = dot_product(hv(1:nc),hd(1:nc))*tau
              endif
589
590
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
591
              x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
592
#endif
Andreas Marek's avatar
Andreas Marek committed
593
594
595
596
              if (.NOT. isSkewsymmetric) then
                hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc)
              endif
              if (wantDebug) call obj%timer%start("blas")
597
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
598
599
              if (isSkewsymmetric) then
                call ELPA_PRECISION_SSR2(int(nc,kind=BLAS_KIND), hd,  hv, ab(1,ns), &
600
                                           int(2*nb-1,kind=BLAS_KIND) )
Andreas Marek's avatar
Andreas Marek committed
601
602
              else
                call PRECISION_SYR2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, &
603
                                    hv, 1_BLAS_KIND, ab(1,ns), int(2*nb-1,kind=BLAS_KIND))
Andreas Marek's avatar
Andreas Marek committed
604
              endif
605
606
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
607
608
              call PRECISION_HER2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, &
                                  hv, 1_BLAS_KIND, ab(1,ns), int(2*nb-1,kind=BLAS_KIND))
609
#endif
Andreas Marek's avatar
Andreas Marek committed
610
611
612
613
              if (wantDebug) call obj%timer%stop("blas")
              hv_t(:,my_thread) = 0.0_rck
              tau_t(my_thread)  = 0.0_rck
              if (nr<=0) cycle ! No subdiagonal block present any more
614

Andreas Marek's avatar
Andreas Marek committed
615
616
617
618
619
620
621
              ! Transform subdiagonal block
              if (wantDebug) call obj%timer%start("blas")
              call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), tau, &
                                  ab(nb+1,ns), int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, &
                                  ZERO, hs, 1_BLAS_KIND)
              if (wantDebug) call obj%timer%stop("blas")
              if (nr>1) then
622

Andreas Marek's avatar
Andreas Marek committed
623
              ! complete (old) Householder transformation for first column
624

Andreas Marek's avatar
Andreas Marek committed
625
              ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
626

Andreas Marek's avatar
Andreas Marek committed
627
628
              ! calculate new Householder transformation for first column
              ! (stored in hv_t(:,my_thread) and tau_t(my_thread))
629
630

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
631
              vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
632
633
634
#endif
#if COMPLEXCASE == 1
#ifdef  DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
635
              vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
636
#else
Andreas Marek's avatar
Andreas Marek committed
637
              vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
638
#endif
639
#endif /* COMPLEXCASE */
640

Andreas Marek's avatar
Andreas Marek committed
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
              call hh_transform_&
              &MATH_DATATYPE&
              &_&
              &PRECISION &
              (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)

              hv_t(1   ,my_thread) = 1.0_rck
              hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
              ab(nb+2:,ns) = 0.0_rck
              ! update subdiagonal block for old and new Householder transformation
              ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
              if (wantDebug) call obj%timer%start("blas")
              call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,            &
                                  int(nr,kind=BLAS_KIND), int(nb-1,kind=BLAS_KIND), &
                                  tau_t(my_thread), ab(nb,ns+1), int(2*nb-1,kind=BLAS_KIND), &
                                  hv_t(1,my_thread), 1_BLAS_KIND, ZERO, h(2), 1_BLAS_KIND)
              if (wantDebug) call obj%timer%stop("blas")

              x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
              h(2:nb) = h(2:nb) - x*hv(2:nb)
              ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
              do i=2,nb
                ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*  &
664
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
665
                h(i) - hs(1:nr)*hv(i)
666
667
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
668
                conjg(h(i)) - hs(1:nr)*conjg(hv(i))
669
#endif
Andreas Marek's avatar
Andreas Marek committed
670
              enddo
671

Andreas Marek's avatar
Andreas Marek committed
672
            else
673

Andreas Marek's avatar
Andreas Marek committed
674
675
676
              ! No new Householder transformation for nr=1, just complete the old one
              ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
              do i=2,nb
677
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
678
                ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
679
680
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
681
                ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
682
#endif
Andreas Marek's avatar
Andreas Marek committed
683
684
685
686
              enddo
              ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
              hv_t(1,my_thread) = 1.0_rck
            endif
687

Andreas Marek's avatar
Andreas Marek committed
688
          enddo
689

Andreas Marek's avatar
Andreas Marek committed
690
691
        enddo ! my_thread
        !$omp end parallel do
692

Andreas Marek's avatar
Andreas Marek committed
693
        call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
694

Andreas Marek's avatar
Andreas Marek committed
695
696
        if (iter==1) then
          ! We are at the end of the first block
697

Andreas Marek's avatar
Andreas Marek committed
698
699
          ! Send our first column to previous PE
          if (my_pe>0 .and. na_s <= na) then
700
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
701
702
703
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
704
705

#endif
Andreas Marek's avatar
Andreas Marek committed
706
            ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
707
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
708
709
710
711
712
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                           int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, &
                           int(communicator,kind=MPI_KIND), ireq_ab, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
713
714

#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
715
          endif
716

Andreas Marek's avatar
Andreas Marek committed
717
718
          ! Request last column from next PE
          ne = na_s + nblocks*nb - (max_threads-1) - 1
719
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
720
          if (wantDebug) call obj%timer%start("mpi_communication")
721

Andreas Marek's avatar
Andreas Marek committed
722
723
724
725
726
727
          if (istep>=max_threads .and. ne <= na) then
            call mpi_recv(ab(1,ne-n_off), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL,  &
                          int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), &
                          MPI_STATUS_IGNORE, mpierr)
          endif
          if (wantDebug) call obj%timer%stop("mpi_communication")
728
#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
729
730
731
          if (istep>=max_threads .and. ne <= na) then
            ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
          endif
732
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
733
734
        else
          ! We are at the end of all blocks
735

Andreas Marek's avatar
Andreas Marek committed
736
737
738
          ! Send last HH Vector and TAU to next PE if it has been calculated above
          ne = na_s + nblocks*nb - (max_threads-1) - 1
          if (istep>=max_threads .and. ne < na) then
739
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
740
741
742
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
743
#endif
Andreas Marek's avatar
Andreas Marek committed
744
745
            hv_s(1) = tau_t(max_threads)
            hv_s(2:) = hv_t(2:,max_threads)
746
747

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
748
749
750
751
752
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_isend(hv_s, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                           int(my_pe+1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), &
                           ireq_hv, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
753
754

#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
755
          endif
756

Andreas Marek's avatar
Andreas Marek committed
757
758
759
760
761
          ! "Send" HH Vector and TAU to next OpenMP thread
          do my_thread = max_threads, 2, -1
            hv_t(:,my_thread) = hv_t(:,my_thread-1)
            tau_t(my_thread)  = tau_t(my_thread-1)
          enddo
762

Andreas Marek's avatar
Andreas Marek committed
763
764
        endif
      enddo ! iter
765

Andreas Marek's avatar
Andreas Marek committed
766
    else
767

Andreas Marek's avatar
Andreas Marek committed
768
      ! Codepath for 1 thread without OpenMP
769

Andreas Marek's avatar
Andreas Marek committed
770
771
772
773
774
      ! The following code is structured in a way to keep waiting times for
      ! other PEs at a minimum, especially if there is only one block.
      ! For this reason, it requests the last column as late as possible
      ! and sends the Householder Vector and the first column as early
      ! as possible.
775

776
#endif /* WITH_OPENMP_TRADITIONAL */
777

Andreas Marek's avatar
Andreas Marek committed
778
779
780
      do iblk=1,nblocks
        ns = na_s + (iblk-1)*nb - n_off ! first column in block
        ne = ns+nb-1                    ! last column in block
781

Andreas Marek's avatar
Andreas Marek committed
782
        if (ns+n_off>na) exit
783

Andreas Marek's avatar
Andreas Marek committed
784
        ! Store Householder Vector for back transformation
785

Andreas Marek's avatar
Andreas Marek committed
786
        hh_cnt(iblk) = hh_cnt(iblk) + 1
787

Andreas Marek's avatar
Andreas Marek committed
788
789
        hh_gath(1   ,hh_cnt(iblk),iblk) = tau
        hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
790

791
#ifndef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
792
793
        if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
          ! Wait for last transfer to finish
794
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
795
          if (wantDebug) call obj%timer%start("mpi_communication")
796

Andreas Marek's avatar
Andreas Marek committed
797
798
          call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
799
#endif
Andreas Marek's avatar
Andreas Marek committed
800
801
802
          ! Copy vectors into send buffer
          hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
          ! Send to destination
803
804

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
805
806
807
808
809
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_isend(hh_send(1,1,iblk), int(nb*hh_cnt(iblk),kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                         global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
                         int(10+iblk,kind=MPI_KIND), int(communicator,kind=MPI_KIND), ireq_hhs(iblk), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
810
#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
811
812
813
          ! do the post-poned irecv here
          startAddr = startAddr - hh_cnt(iblk)
          hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
814
815
#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
816
817
818
819
          ! Reset counter and increase destination row
          hh_cnt(iblk) = 0
          hh_dst(iblk) = hh_dst(iblk)+1
        endif
820

Andreas Marek's avatar
Andreas Marek committed
821
822
823
824
825
        ! The following code is structured in a way to keep waiting times for
        ! other PEs at a minimum, especially if there is only one block.
        ! For this reason, it requests the last column as late as possible
        ! and sends the Householder Vector and the first column as early
        ! as possible.
826
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
827
828
829
        nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
        nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                      ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
830

Andreas Marek's avatar
Andreas Marek committed
831
        ! Multiply diagonal block and subdiagonal block with Householder Vector
832

Andreas Marek's avatar
Andreas Marek committed
833
        if (iblk==nblocks .and. nc==nb) then
834

Andreas Marek's avatar
Andreas Marek committed
835
836
          ! We need the last column from the next PE.
          ! First do the matrix multiplications without last column ...
837

Andreas Marek's avatar
Andreas Marek committed
838
839
840
          ! Diagonal block, the contribution of the last element is added below!
          ab(1,ne) = 0.0_rck
          if (wantDebug) call obj%timer%start("blas")
841
842

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
843
844
845
846
847
848
849
          if (isSkewsymmetric) then
            hd(:) = 0.0_rk
            call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd)
          else
            call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
                               hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
          endif
850
851
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
852
853
          call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
                             hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
854
#endif
Andreas Marek's avatar
Andreas Marek committed
855
856
857
858
859
          ! Subdiagonal block
          if (nr>0) call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb-1,kind=BLAS_KIND), &
                                        tau, ab(nb+1,ns), int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, &
                                        ZERO, hs, 1_BLAS_KIND)
          if (wantDebug) call obj%timer%stop("blas")
860

Andreas Marek's avatar
Andreas Marek committed
861
          ! ... then request last column ...
862
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
863
          if (wantDebug) call obj%timer%start("mpi_communication")
864
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
865
866
867
          call mpi_recv(ab(1,ne), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL,  &
                       int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), &
                       MPI_STATUS_IGNORE, mpierr)
868
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
869
870
871
          call mpi_recv(ab(1,ne), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL,  &
                       int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), &
                       MPI_STATUS_IGNORE, mpierr)
872
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
873
          if (wantDebug) call obj%timer%stop("mpi_communication")
874
875
#else /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
876
          ab(1:nb+1,ne) = ab_s(1:nb+1)
877
878
879

#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
880
881
882
          ! ... and complete the result
          hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
          hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
883

Andreas Marek's avatar
Andreas Marek committed
884
        else
885

Andreas Marek's avatar
Andreas Marek committed
886
887
          ! Normal matrix multiply
          if (wantDebug) call obj%timer%start("blas")
888
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
889
890
891
892
893
894
895
          if (isSkewsymmetric) then
            hd(:) = 0.0_rk
            call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd)
          else
            call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
                                hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
          endif
896
897
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
898
899
          call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), &
                              hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND)
900
#endif
Andreas Marek's avatar
Andreas Marek committed
901
902
903
904
          if (nr>0) call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), tau, ab(nb+1,ns), &
                                        int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, ZERO, hs, 1_BLAS_KIND)
          if (wantDebug) call obj%timer%stop("blas")
        endif
905

Andreas Marek's avatar
Andreas Marek committed
906
907
908
909
910
        ! Calculate first column of subdiagonal block and calculate new
        ! Householder transformation for this column
        hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb
        tau_new = 0.0_rck
        if (nr>0) then
911

Andreas Marek's avatar
Andreas Marek committed
912
          ! complete (old) Householder transformation for first column
913

Andreas Marek's avatar
Andreas Marek committed
914
          ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
915

Andreas Marek's avatar
Andreas Marek committed
916
917
          ! calculate new Householder transformation ...
          if (nr>1) then
918
#if  REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
919
            vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
920
921
922
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
923
            vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
924
#else
Andreas Marek's avatar
Andreas Marek committed
925
            vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
926
#endif
927
#endif /* COMPLEXCASE */
928

Andreas Marek's avatar
Andreas Marek committed
929
            call hh_transform_&
930
931
                &MATH_DATATYPE&
                &_&
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
932
                &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
933
934
935
936
937
               (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug)
            hv_new(1) = 1.0_rck
            hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
            ab(nb+2:,ns) = 0.0_rck
          endif ! nr > 1