#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif


!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param na          Order of matrix
!>
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
!>              The distribution is like in ScaLAPACK.
!>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half);
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
!>
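!> \par Usage (illustrative sketch, not part of the original interface documentation;
!>      assumes the instantiation of this template with MATH_DATATYPE=real and PRECISION=double):
!> \code
!>   call tridiag_real_double(na, a_mat, lda, nblk, matrixCols, &
!>                            mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, .false.)
!> \endcode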

    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
      use cuda_functions
      use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU

#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)

      integer(kind=ik), parameter                   :: max_stored_uv = 32
      logical,          parameter                   :: mat_vec_as_one_block = .true.

      ! id in processor row and column and total numbers of processor rows and columns
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, my_rank
      integer(kind=ik)                              :: mpierr
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO=0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO=0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
      ! updated after each istep (in the main cycle) to contain number of
      ! local columns and rows of the remaining part of the matrix
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs

      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA

      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_intptr_t)                        :: a_offset

#ifdef WITH_OPENMP
      integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
      integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif

      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif

#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store calculated Householder vector
                                                       v_col(:), &   ! the same vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
      ! u and v are stored both in row and vector forms
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
      ! pattern: u1,v1,u2,v2,u3,v3,....
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif

#ifdef WITH_OPENMP
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
#endif

#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
      integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
      call timer%start("tridiag_&
      &MATH_DATATYPE&
      &" // &
      PRECISION_SUFFIX &
      )


      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      ! a tile is a square submatrix, consisting of several blocks
      ! it is the smallest possible square submatrix in which the blocks distributed among
      ! the processors are "aligned" in both rows and columns
      !  -----------------
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents a block assigned to the processor with the shown number
      ! size of this small block is nblk
      ! The image is for a situation with 6 processors: 3 processor rows and 2 processor columns
      ! tile_size is thus nblk * 6
      !
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile
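      ! worked example (illustrative values, not from the original source):
      ! for nblk = 16, np_rows = 3, np_cols = 2:
      !   least_common_multiple(3,2) = 6, so the minimum tile_size is 16*6 = 96;
      !   128*max(3,2) = 384, hence tile_size is rounded up to ((384-1)/96+1)*96 = 384,
      !   giving l_rows_per_tile = 384/3 = 128 and l_cols_per_tile = 384/2 = 192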

      totalblocks = (na-1)/nblk + 1
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1

      ! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk
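      ! worked example (illustrative values, not from the original source):
      ! for na = 1000, nblk = 16, np_rows = 3, np_cols = 2:
      !   totalblocks = (1000-1)/16 + 1 = 63,
      !   max_loc_block_rows = (63-1)/3 + 1 = 21, so max_local_rows = 21*16 = 336,
      !   max_loc_block_cols = (63-1)/2 + 1 = 32, so max_local_cols = 32*16 = 512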

      ! allocate memory for vectors
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)

      ! allocate v_row 1 element longer to allow store and broadcast tau together with it
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)

      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)

#ifdef WITH_OPENMP
      max_threads = omp_get_max_threads()

      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */

      tmp = 0
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0

      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

      if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)

         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)

         check_alloc_cuda("tridiag: u_row_dev", successCUDA)

         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)

         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)

         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif !useGPU


      d_vec(:) = 0
      e_vec(:) = 0
      tau(:) = 0

      n_stored_vecs = 0

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
        d_vec(na) = a_mat(l_rows,l_cols)

      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix

        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)

        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif

      ! main cycle of tridiagonalization
      ! in each step, 1 Householder vector is calculated
      do istep = na, 3 ,-1

        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

        ! Calculate vector for Householder transformation on all procs
        ! owning column istep

        if (my_pcol == pcol(istep, nblk, np_cols)) then

          ! Get vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column

          ! copy column number l_cols+1 of A to v_row
          if (useGPU) then
            a_offset = l_cols * lda * size_of_datatype
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif

            if (n_stored_vecs > 0 .and. l_rows > 0) then
	      call timer%start("blas")
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
	                          l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
#if REALCASE == 1
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
#endif
#if COMPLEXCASE == 1
                                   aux, 1,  &

#endif
                                  ONE, v_row, 1)
	       call timer%stop("blas")

            endif

            if(my_prow == prow(istep-1, nblk, np_rows)) then
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
            else
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
               aux1(2) = 0.
            endif

#ifdef WITH_MPI
            call timer%start("mpi_communication")
            call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, mpi_comm_rows, mpierr)
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
          aux2 = aux1
#endif /* WITH_MPI */
          vnorm2 = aux2(1)
          vrl    = aux2(2)

          ! Householder transformation
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
               &PRECISION &
                             (vrl, vnorm2, xf, tau(istep))
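          ! note (not in the original source, inferred from how the results are used below):
          ! on exit vrl holds the new off-diagonal element, xf the factor by which v_row is
          ! scaled, and tau(istep) the Householder factor of the transformation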
          ! Scale v_row and store Householder vector for back transformation

          v_row(1:l_rows) = v_row(1:l_rows) * xf
          if (my_prow == prow(istep-1, nblk, np_rows)) then
            v_row(l_rows) = 1.

            ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
            e_vec(istep-1) = vrl
          endif

          ! store Householder vector for back transformation
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

          ! add tau after the end of the actual v_row, to be broadcast together with it
          v_row(l_rows+1) = tau(istep)
         endif !(my_pcol == pcol(istep, nblk, np_cols))

#ifdef WITH_MPI
         ! Broadcast the Householder vector (and tau) along columns
         call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */

        ! recover tau, which has been broadcast together with v_row
        tau(istep) =  v_row(l_rows+1)

        ! Transpose Householder vector v_row -> v_col
        call elpa_transpose_vectors_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                   (v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
                    1, istep-1, 1, nblk)

        ! Calculate u = (A + VU**T + UV**T)*v

        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
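        ! (explanatory note, not in the original source: each tile on or above the diagonal is
        !  visited once; its transposed product with v_row is accumulated into u_col, while for
        !  strictly off-diagonal tiles the untransposed product with v_col goes into u_row and
        !  stands in for the mirrored tile below the diagonal. The two partial results are
        !  combined further down via elpa_reduce_add_vectors / mpi_allreduce.)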

        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
        if (l_rows > 0 .and. l_cols> 0 ) then
          if(useGPU) then
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)

            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
          call timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()

          n_iter = 0

          ! first calculate the A*v part of (A + VU**T + UV**T)*v
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
          do i= 0, (istep-2)/tile_size
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
              if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
              if (mod(n_iter,n_threads) == my_thread) then
                call timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

                if (i/=j) then
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
                endif
                call timer%stop("blas")
              endif
              n_iter = n_iter+1
#else /* WITH_OPENMP */

              ! multiplication by blocks is efficient only for CPU
              ! for GPU we introduced 2 other ways, either by stripes (more similar to the original
              ! CPU implementation) or by one large matrix vector multiply
              if (.not. useGPU) then
                call timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                            ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                            v_row(l_row_beg), 1,                           &
                            ONE, u_col(l_col_beg), 1)

                if (i/=j) then

                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
                endif
                call timer%stop("blas")
              endif ! not useGPU

#endif /* WITH_OPENMP */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if(mat_vec_as_one_block) then
              ! Unlike for the CPU, we (for each MPI task) do just one large mat-vec multiplication
              ! this requires altering the algorithm when later explicitly updating the matrix
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              call timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, lda,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, lda,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
              call timer%stop("cublas")

            else 
              ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
              ! larger matrices. In general, however, this algorithm is very similar to the one for the CPU
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                                          
                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
                  
                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype
                              
                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo
              
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                  
                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype
                          
                  call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, & 
                              ONE, a_dev + a_offset, lda, &
                              v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                              ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
              enddo        
            end if !multiplication as one block / per stripes 
                
            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)

            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif

!              call PRECISION_SYMV('U', l_cols,  &
!                         1.d0, a_mat, ubound(a_mat,1),  &
!                         v_row, 1,  &
!                         0.d0, u_col, 1)

!            endif ! useGPU

#ifdef WITH_OPENMP
!$OMP END PARALLEL
          call timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */

          ! second calculate the (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
          if (n_stored_vecs > 0) then
            call timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C',     &
#endif
	                        l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
            call timer%stop("blas")
          endif

        endif  ! (l_rows>0 .and. l_cols>0)

        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
        ! on the processors containing the diagonal
        ! This is only necessary if u_row has been calculated, i.e. if the
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then

	  call elpa_reduce_add_vectors_&
	  &MATH_DATATYPE&
	  &_&
	  &PRECISION &
          (u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
	   mpi_comm_cols, istep-1, 1, nblk)

        endif

        ! Sum up all the u_col(:) parts, transpose u_col -> u_row

        if (l_cols>0) then
          tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
                             MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
          u_col = tmp
#endif /* WITH_MPI */
        endif

        call elpa_transpose_vectors_&
	&MATH_DATATYPE&
	&_&
	&PRECISION &
        (u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
	 mpi_comm_rows, 1, istep-1, 1, nblk)

        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
        x = 0
        if (l_cols>0)  &
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#if COMPLEXCASE == 1
        xc = 0
        if (l_cols>0)  &
           xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif

#ifdef WITH_MPI
        call timer%start("mpi_communication")
#if REALCASE == 1
        call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
        call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
        call timer%stop("mpi_communication")
#else /* WITH_MPI */

#if REALCASE == 1
        vav = x
#endif
#if COMPLEXCASE == 1
        vav = xc
#endif

#endif /* WITH_MPI */

        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here

        do j=1,l_rows
#if REALCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
#endif
#if COMPLEXCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
        enddo
        do j=1,l_cols
#if REALCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
#endif
#if COMPLEXCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
        enddo

        ! We have calculated another Householder vector; the number of implicitly stored vector pairs has increased
        n_stored_vecs = n_stored_vecs+1

        ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
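        ! (explanatory note, not in the original source: with the storage patterns
        !  vu_stored_rows = [v1,u1,v2,u2,...] and uv_stored_cols = [u1,v1,u2,v2,...],
        !  a single GEMM  vu_stored_rows * uv_stored_cols**T = sum_k ( v_k*u_k**T + u_k*v_k**T )
        !  applies the whole accumulated rank-2*n_stored_vecs update at once)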
        if (n_stored_vecs == max_stored_uv .or. istep == 3) then

          if (useGPU) then
            successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
                                      max_local_rows * 2 * max_stored_uv *          &
                                      size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)

            successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
                                      max_local_cols * 2 * max_stored_uv *          &
                                      size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
          endif

          do i = 0, (istep-2)/tile_size
            ! go over tiles above (or on) the diagonal
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            l_row_beg = 1
            l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
            if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
              cycle

            
            if (useGPU) then
              if(.not. mat_vec_as_one_block) then
                ! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
                ! we then use the same calls as for the CPU version
                call timer%start("cublas")
                call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
                                          l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
                                          ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
                                          size_of_datatype,  &
                                          max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
                                          size_of_datatype,  &
                                          max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) *            &
                                          size_of_datatype , lda)
                call timer%stop("cublas")
              endif 
            else !useGPU
              call timer%start("blas")
              call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,                &
                                   l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,    &
                                   ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1),   &
                                   uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1),        &
                                   ONE, a_mat(l_row_beg,l_col_beg), lda)
              call timer%stop("blas")
            endif !useGPU
          enddo
          
          if (useGPU) then
            if(mat_vec_as_one_block) then
              !update whole (remaining) part of matrix, including tiles below diagonal 
              !we can do that in one large cublas call
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs,   &
                                        ONE, vu_stored_rows_dev, max_local_rows, &
                                        uv_stored_cols_dev, max_local_cols,  &
                                        ONE, a_dev, lda)
              call timer%stop("cublas")
            endif 
          endif

          n_stored_vecs = 0
        endif

        if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
          if (useGPU) then
            !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
             a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype

             successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
                                       1 *  size_of_datatype, cudaMemcpyDeviceToHost)
             check_memcpy_cuda("tridiag: a_dev 3", successCUDA)

          endif
          if (n_stored_vecs > 0) then
            a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
                        + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
          end if
          d_vec(istep-1) = a_mat(l_rows,l_cols)

          if (useGPU) then
            !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
            !successCUDA = cuda_threadsynchronize()
            !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)

             successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
                                       int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
             check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
          endif
        endif

      enddo ! main cycle over istep=na,3,-1

#if COMPLEXCASE == 1
      ! Store e_vec(1) and d_vec(1)

      if (my_pcol==pcol(2, nblk, np_cols)) then
        if (my_prow==prow(1, nblk, np_rows)) then
          ! We use last l_cols value of loop above
          if(useGPU) then
            successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
            vrl = aux3(1)
          else !useGPU
            vrl = a_mat(1,l_cols)
          endif !useGPU
          call hh_transform_complex_&
                                    &PRECISION &
                                    (vrl, CONST_REAL_0_0, xf, tau(2))
          e_vec(1) = vrl

          a_mat(1,l_cols) = 1. ! for consistency only
        endif
#ifdef WITH_MPI
        call timer%start("mpi_communication")
        call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
        call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      endif

#ifdef WITH_MPI
      call timer%start("mpi_communication")
      call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
      call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols))  then
        if(useGPU) then
          successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
          check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
          d_vec(1) = PRECISION_REAL(aux3(1))
        else !useGPU
          d_vec(1) = PRECISION_REAL(a_mat(1,1))
        endif !useGPU
      endif

#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
      ! Store e_vec(1)

      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
        if(useGPU) then
          successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
          check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
        else !useGPU
          e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
        endif !useGPU
      endif

     ! Store d_vec(1)
      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
        if(useGPU) then
          successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
          check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
        else !useGPU
          d_vec(1) = a_mat(1,1)
        endif !useGPU
      endif
#endif

      deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
#if REALCASE == 1
        print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
#endif
#if COMPLEXCASE == 1
        print *,"tridiag_complex: error when deallocating tmp "//errorMessage
#endif
        stop 1
      endif

      if (useGPU) then
        ! todo: should we leave a_mat on the device for further use?
        successCUDA = cuda_free(a_dev)
        check_dealloc_cuda("tridiag: a_dev 9", successCUDA)

        successCUDA = cuda_free(v_row_dev)
        check_dealloc_cuda("tridiag: v_row_dev", successCUDA)

        successCUDA = cuda_free(u_row_dev)
        check_dealloc_cuda("tridiag: u_row_dev", successCUDA)

        successCUDA = cuda_free(v_col_dev)
        check_dealloc_cuda("tridiag: v_col_dev", successCUDA)

        successCUDA = cuda_free(u_col_dev)
        check_dealloc_cuda("tridiag: u_col_dev ", successCUDA)

        successCUDA = cuda_free(vu_stored_rows_dev)
        check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successCUDA)

        successCUDA = cuda_free(uv_stored_cols_dev)
        check_dealloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif

      ! distribute the arrays d_vec and e_vec to all processors
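      ! (each entry of d_vec and e_vec has been set on exactly one process and is still zero on
      !  all others, so summing over the row and then the column communicator replicates the
      !  complete vectors everywhere)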

#if REALCASE == 1
      allocate(tmp(na), stat=istat, errmsg=errorMessage)

      if (istat .ne. 0) then
        print *,"tridiag_real: error when allocating tmp "//errorMessage
        stop 1
      endif
#endif
#if COMPLEXCASE == 1
      allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
        stop 1
      endif
#endif


#ifdef WITH_MPI
      call timer%start("mpi_communication")
#if REALCASE == 1
      tmp = d_vec
      call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
      tmp = d_vec
      call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
      tmp = e_vec
      call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
      tmp = e_vec
      call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
      tmp_real = d_vec
      call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
      tmp_real = d_vec
      call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION ,MPI_SUM, mpi_comm_cols, mpierr)
      tmp_real = e_vec
      call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)