elpa1_tridiag_template.X90 40.2 KB
Newer Older
Pavel Kus's avatar
Pavel Kus committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

55
56
57
58
59
60
61

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param na          Order of matrix
!>
62
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
63
64
65
66
67
68
69
70
71
72
73
74
75
!>              Distribution is like in Scalapack.
!>              Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
76
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
77
!>
78
!> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
79
80
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
Pavel Kus's avatar
Pavel Kus committed
81
82
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
83
!>
84

85
86
87
88
89
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
90
91
      use cuda_functions
      use iso_c_binding
Pavel Kus's avatar
Pavel Kus committed
92
93
#ifdef HAVE_DETAILED_TIMINGS
      use timings
94
95
#else
      use timings_dummy
Pavel Kus's avatar
Pavel Kus committed
96
97
98
99
#endif
      use precision
      implicit none

100
101
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU
102

103
104
105
106
107
108
109
110
111
112
113
114
115
116
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
Pavel Kus's avatar
Pavel Kus committed
117
#ifdef USE_ASSUMED_SIZE
118
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
Pavel Kus's avatar
Pavel Kus committed
119
#else
120
121
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
Pavel Kus's avatar
Pavel Kus committed
122
#endif
123
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)
Pavel Kus's avatar
Pavel Kus committed
124

125
      integer(kind=ik), parameter                   :: max_stored_uv = 32
Pavel Kus's avatar
Pavel Kus committed
126
127

      ! id in processor row and column and total numbers of processor rows and columns
128
129
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, my_rank
      integer(kind=ik)                              :: mpierr
130
131
132
133
134
135
136
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO=0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO=0.0_rk4, ONE = 1.0_rk4
#endif
#endif
137
138
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
139
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
140
#else
141
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
142
143
144
145
#endif
#endif
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
146
      ! updated after each istep (in the main cycle) to contain number of
Pavel Kus's avatar
Pavel Kus committed
147
      ! local columns and rows of the remaining part of the matrix
148
149
150
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs
151

152
153
154
      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA
155

156
157
158
      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_size_t)                        :: a_offset
Pavel Kus's avatar
Pavel Kus committed
159
160

#ifdef WITH_OPENMP
161
162
      integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
      integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
Pavel Kus's avatar
Pavel Kus committed
163
164
#endif

165
166
167
168
169
170
171
      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif
172

173
174
175
176
177
178
179
180
181
182
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store calculated Householder vector
                                                       v_col(:), &   ! the same vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
183
184
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, than the matrix A_i is explicitli updated
185
      ! u and v are stored both in row and vector forms
186
187
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
188
189
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
190
      ! pattern: u1,v1,u2,v2,u3,v3,....
191
192
193
194
195
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif
196

Pavel Kus's avatar
Pavel Kus committed
197
#ifdef WITH_OPENMP
198
199
200
201
202
203
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
Pavel Kus's avatar
Pavel Kus committed
204
205
#endif

206
207
208
209
210
#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
211
212
213
214
      integer(kind=c_size_t), parameter             :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
215
216
217
218
219
220
      call timer%start("tridiag_&
      &MATH_DATATYPE&
      &_" // &
      PRECISION_SUFFIX &
      )

Pavel Kus's avatar
Pavel Kus committed
221
222
223
224
225
226
      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")
227

Pavel Kus's avatar
Pavel Kus committed
228
      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
229
230
231
      ! seems that tile is a square submatrix, consisting by several blocks
      ! it is a smallest possible square submatrix, where blocks being distributed among
      ! processors are "aligned" in both rows and columns
232
      !  -----------------
233
234
235
236
237
238
239
240
241
242
243
244
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents block, assigned to a processor with the shown number
245
      ! size of this small block is nblk
246
      ! Image is for situation with 6 processors, 3 processor rows and 2 columns
247
      ! tile_size is thus nblk * 6
248
      !
Pavel Kus's avatar
Pavel Kus committed
249
250
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
251

252
253
      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile
254

Pavel Kus's avatar
Pavel Kus committed
255
      totalblocks = (na-1)/nblk + 1
256
257
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1
Pavel Kus's avatar
Pavel Kus committed
258

259
260
261
      ! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk
262

263
264
265
266
267
      ! allocate memmory for vectors
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
Pavel Kus's avatar
Pavel Kus committed
268
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
269
270
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
271

272
      ! allocate v_row 1 element longer to allow store and broadcast tau together with it
273
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
274
275
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
276

277
      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
278
279
280
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

281
      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
282
283
284
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

285
      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
286
287
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
288
289
290
291
292

#ifdef WITH_OPENMP
      max_threads = omp_get_max_threads()

      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
293
294
295
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

Pavel Kus's avatar
Pavel Kus committed
296
      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
297
298
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
299
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
300
301

      tmp = 0
302
303
304
305
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0
Pavel Kus's avatar
Pavel Kus committed
306

307
      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
308
309
310
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

311
      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
312
313
314
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

315
      if (useGPU) then
316
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
317
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)
318

319
320
         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)

Andreas Marek's avatar
Andreas Marek committed
321
         check_alloc_cuda("tridiag: u_row_dev", successCUDA)
322

323
         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
324
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)
325

326
         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
327
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)
328

329
         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
330
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
331

332
         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
333
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
334
      endif !useGPU
Pavel Kus's avatar
Pavel Kus committed
335

336

Pavel Kus's avatar
Pavel Kus committed
337
338
      d_vec(:) = 0
      e_vec(:) = 0
Pavel Kus's avatar
Pavel Kus committed
339
340
      tau(:) = 0

341
      n_stored_vecs = 0
Pavel Kus's avatar
Pavel Kus committed
342

Pavel Kus's avatar
Pavel Kus committed
343
344
      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
345
      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
Pavel Kus's avatar
Pavel Kus committed
346
        d_vec(na) = a_mat(l_rows,l_cols)
347

348
349
      if (useGPU) then
        ! allocate memmory for matrix A on the device and than copy the matrix
350

351
        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
352
        check_alloc_cuda("tridiag: a_dev", successCUDA)
353

354
        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
355
        check_alloc_cuda("tridiag: a_dev", successCUDA)
356
      endif
Pavel Kus's avatar
Pavel Kus committed
357

358
359
      ! main cycle of tridiagonalization
      ! in each step, 1 Householder vector is calculated
360
      do istep = na, 3 ,-1
Pavel Kus's avatar
Pavel Kus committed
361

362
363
364
365
        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
Pavel Kus's avatar
Pavel Kus committed
366

367
368
        ! Calculate vector for Householder transformation on all procs
        ! owning column istep
Pavel Kus's avatar
Pavel Kus committed
369

370
        if (my_pcol == pcol(istep, nblk, np_cols)) then
Pavel Kus's avatar
Pavel Kus committed
371

372
373
          ! Get vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
Pavel Kus's avatar
Pavel Kus committed
374

375
376
          ! copy l_cols + 1 column of A to v_row
          if (useGPU) then
377
	    a_offset = l_cols * lda * size_of_datatype
378
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
379

380
            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
381
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
382
383
384
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif
385

386
387
            if (n_stored_vecs > 0 .and. l_rows > 0) then
	      call timer%start("blas")
388
389
390
391
392
393
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
	                          l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
394
#if REALCASE == 1
395
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
396
397
#endif
#if COMPLEXCASE == 1
398
399
                                   aux, 1,  &

400
#endif
401
                                  ONE, v_row, 1)
402
403
	       call timer%stop("blas")

404
            endif
405

406
            if(my_prow == prow(istep-1, nblk, np_rows)) then
407
408
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
Pavel Kus's avatar
Pavel Kus committed
409
            else
410
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
Pavel Kus's avatar
Pavel Kus committed
411
412
413
414
415
               aux1(2) = 0.
            endif

#ifdef WITH_MPI
            call timer%start("mpi_communication")
416
417
            call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
418
419
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
420
          aux2 = aux1
Pavel Kus's avatar
Pavel Kus committed
421
#endif /* WITH_MPI */
422
423
          vnorm2 = aux2(1)
          vrl    = aux2(2)
Pavel Kus's avatar
Pavel Kus committed
424

425
          ! Householder transformation
426
#if REALCASE == 1
427
          call hh_transform_real_&
428
429
#endif
#if COMPLEXCASE == 1
430
          call hh_transform_complex_&
431
#endif
Andreas Marek's avatar
cleanup    
Andreas Marek committed
432
               &PRECISION &
433
                             (vrl, vnorm2, xf, tau(istep))
434
          ! Scale v_row and store Householder vector for back transformation
Pavel Kus's avatar
Pavel Kus committed
435

436
          v_row(1:l_rows) = v_row(1:l_rows) * xf
437
          if (my_prow == prow(istep-1, nblk, np_rows)) then
438
439
440
441
442
443
444
            v_row(l_rows) = 1.

            ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
            e_vec(istep-1) = vrl
          endif

          ! store Householder vector for back transformation
445
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
446

447
          ! add tau after the end of actuall v_row, to be broadcasted with it
448
          v_row(l_rows+1) = tau(istep)
449
         endif !(my_pcol == pcol(istep, nblk, np_cols))
Pavel Kus's avatar
Pavel Kus committed
450
451

#ifdef WITH_MPI
452
         ! Broadcast the Householder vector (and tau) along columns
453
         call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
454
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
Pavel Kus's avatar
Pavel Kus committed
455
456
#endif /* WITH_MPI */

457
458
        !recover tau, which has been broadcasted together with v_row
        tau(istep) =  v_row(l_rows+1)
459

460
        ! Transpose Householder vector v_row -> v_col
461
	call elpa_transpose_vectors_&
Andreas Marek's avatar
cleanup    
Andreas Marek committed
462
463
464
             &MATH_DATATYPE&
             &_&
             &PRECISION &
465
466
467
                                   (v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
				    1, istep-1, 1, nblk)

468
        ! Calculate u = (A + VU**T + UV**T)*v
Pavel Kus's avatar
Pavel Kus committed
469

470
471
        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
Pavel Kus's avatar
Pavel Kus committed
472

473
474
        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
475
        if (l_rows > 0 .and. l_cols> 0 ) then
476
          if(useGPU) then
477
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
478
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
479

480
            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
481
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
482

483
            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
484

Andreas Marek's avatar
Andreas Marek committed
485
            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
486

487
            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
488
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
489
490
491
          endif ! useGU

#if REALCASE == 1
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
!            if (useGPU) then
             !u_col_dev(1:l_cols) = 0.
             !u_row_dev(1:l_rows) = 0.
             !v_col_dev(1:l_cols) = v_col(1:l_cols)
             !v_row_dev(1:l_rows) = v_row(1:l_rows)
!                do i=0,(istep-2)/tile_size
!                  l_col_beg = i*l_cols_per_tile+1
!                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
!                  if(l_col_end<l_col_beg) cycle
!                  do j=0,i
!                     l_row_beg = j*l_rows_per_tile+1
!                     l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
!                     if(l_row_end<l_row_beg) cycle
!                     if(mod(n_iter,n_threads) == my_thread) then
!                       call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
!                       if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
!                     endif
!                     n_iter = n_iter+1
!                  enddo
!                enddo
!
             !--- for now, just use DSYMV!!!
514
             ! a_dev -> a_mat ?
Pavel Kus's avatar
Pavel Kus committed
515
             !write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
516
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
517
!                         1.d0, a_mat, ubound(a_mat,1),  &
518
!                         v_row, 1,  &
519
!                         0.d0, u_col, 1)
520
521
             !u_col(1:l_cols) = u_col_dev(1:l_cols)
             !u_row(1:l_rows) = u_row_dev(1:l_rows)
522

523
!            else !do not use GPU
524
525
#endif

Pavel Kus's avatar
Pavel Kus committed
526
#ifdef WITH_OPENMP
527
          call timer%start("OpenMP parallel")
528
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
Pavel Kus's avatar
Pavel Kus committed
529

530
531
          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()
Pavel Kus's avatar
Pavel Kus committed
532

533
          n_iter = 0
Pavel Kus's avatar
Pavel Kus committed
534

535
536
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
537
#endif /* WITH_OPENMP */
538
          do i= 0, (istep-2)/tile_size
539
540
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
541
542
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
543
544
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
545
              if (l_row_end < l_row_beg) cycle
Pavel Kus's avatar
Pavel Kus committed
546
#ifdef WITH_OPENMP
547
              if (mod(n_iter,n_threads) == my_thread) then
548
	        call timer%start("blas")
549
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
550
551
552
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

553
                if (i/=j) then
554
555
556
557
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
558
                endif
559
	        call timer%stop("blas")
560
561
              endif
              n_iter = n_iter+1
Pavel Kus's avatar
Pavel Kus committed
562
#else /* WITH_OPENMP */
563

564
              if (useGPU) then
565
                a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
566
567
                call timer%start("cublas")
                call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
568
569
570
                                           l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
                                           ONE, a_dev + a_offset, lda,                   &
                                           v_row_dev + (l_row_beg - 1) *                 &
571
                                           size_of_datatype, 1,                          &
572
                                           ONE, u_col_dev + (l_col_beg - 1) *            &
573
                                           size_of_datatype, 1)
574

575
               if(i/=j) then
576
577
578
                  call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
                                             ONE, a_dev + a_offset, lda,                        &
                                             v_col_dev + (l_col_beg - 1) *                      &
579
                                             size_of_datatype, 1,                          &
580
                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
581
                                             size_of_datatype, 1)
582
                endif
583
                call timer%stop("cublas")
584
              else ! useGPU
585
586
587
588
589
590
                call timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                            ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                            v_row(l_row_beg), 1,                           &
                            ONE, u_col(l_col_beg), 1)
591
592

                if (i/=j) then
593

594
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
595
596
597
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
598
                endif
599
	        call timer%stop("blas")
600
              endif ! useGPU
Pavel Kus's avatar
Pavel Kus committed
601
602

#endif /* WITH_OPENMP */
603
            enddo  ! j=0,i
604
605
          enddo  ! i=0,(istep-2)/tile_size

606
          if (useGPU) then
607
            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
608
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
609

610
            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
611
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
612
          endif
613

614
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
615
!                         1.d0, a_mat, ubound(a_mat,1),  &
616
!                         v_row, 1,  &
617
!                         0.d0, u_col, 1)
618
619
620

!            endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
621
622
#ifdef WITH_OPENMP
!$OMP END PARALLEL
623
          call timer%stop("OpenMP parallel")
Pavel Kus's avatar
Pavel Kus committed
624

625
626
627
628
629
          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
630

631
          if (n_stored_vecs > 0) then
632
	    call timer%start("blas")
633
#if REALCASE == 1
634
            call PRECISION_GEMV('T',     &
635
636
#endif
#if COMPLEXCASE == 1
637
            call PRECISION_GEMV('C',     &
638
#endif
639
640
641
642
643
644
645
	                        l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
646
	    call timer%stop("blas")
647
          endif
Pavel Kus's avatar
Pavel Kus committed
648

649
        endif  ! (l_rows>0 .and. l_cols>0)
Pavel Kus's avatar
Pavel Kus committed
650

651
        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
Pavel Kus's avatar
Pavel Kus committed
652
        ! on the processors containing the diagonal
653
        ! This is only necessary if u_row has been calculated, i.e. if the
Pavel Kus's avatar
Pavel Kus committed
654
655
656
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then
657
658
659
660
661
662
663
664

	  call elpa_reduce_add_vectors_&
	  &MATH_DATATYPE&
	  &_&
	  &PRECISION &
          (u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
	   mpi_comm_cols, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
665
666
        endif

667
        ! Sum up all the u_col(:) parts, transpose u_col -> u_row
Pavel Kus's avatar
Pavel Kus committed
668
669

        if (l_cols>0) then
670
          tmp(1:l_cols) = u_col(1:l_cols)
Pavel Kus's avatar
Pavel Kus committed
671
672
#ifdef WITH_MPI
          call timer%start("mpi_communication")
673
          call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
674
                             MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
675
676
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
677
          u_col = tmp
Pavel Kus's avatar
Pavel Kus committed
678
679
680
#endif /* WITH_MPI */
        endif

681
682
683
684
685
686
687
        call elpa_transpose_vectors_&
	&MATH_DATATYPE&
	&_&
	&PRECISION &
        (u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
	 mpi_comm_rows, 1, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
688
        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
689
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
690
        x = 0
691
        if (l_cols>0)  &
692
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
693
694
695
696
697
698
699
#endif
#if COMPLEXCASE == 1
        xc = 0
        if (l_cols>0)  &
           xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif

Pavel Kus's avatar
Pavel Kus committed
700
701
#ifdef WITH_MPI
        call timer%start("mpi_communication")
702
#if REALCASE == 1
703
        call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
704
705
#endif
#if COMPLEXCASE == 1
706
        call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
707
#endif
Pavel Kus's avatar
Pavel Kus committed
708
709
        call timer%stop("mpi_communication")
#else /* WITH_MPI */
710
711

#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
712
        vav = x
713
714
715
716
717
#endif
#if COMPLEXCASE == 1
        vav = xc
#endif

Pavel Kus's avatar
Pavel Kus committed
718
719
720
721
722
723
#endif /* WITH_MPI */

        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here

        do j=1,l_rows
724
#if REALCASE == 1
725
726
          vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
727
728
729
730
731
#endif
#if COMPLEXCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
732
733
        enddo
        do j=1,l_cols
734
#if REALCASE == 1
735
736
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
737
738
739
740
741
#endif
#if COMPLEXCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
742
743
        enddo

744
745
746
747
        ! We have calculated another Hauseholder vector, number of implicitly stored increased
        n_stored_vecs = n_stored_vecs+1

        ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
748
        if (n_stored_vecs == max_stored_uv .or. istep == 3) then
749
750
751

          if (useGPU) then
            successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
752
                                      max_local_rows * 2 * max_stored_uv *          &
753
                                      size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
754
            check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
755

756
            successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
757
                                      max_local_cols * 2 * max_stored_uv *          &
758
                                      size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
759
            check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
760
          endif
Pavel Kus's avatar
Pavel Kus committed
761

762
          do i = 0, (istep-2)/tile_size
763
764
765
766
767
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            l_row_beg = 1
            l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
            if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
768
              cycle
769
770

            if (useGPU) then
771
772
773
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
                                         l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
774
                                         ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
775
                                         size_of_datatype,  &
776
                                         max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
777
                                         size_of_datatype,  &
778
                                         max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) *            &
779
                                         size_of_datatype , lda)
780
              call timer%stop("cublas")
781
            else !useGPU
782
783
784
              call timer%start("blas")
              call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,                &
                                   l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,    &
785
786
787
                                   ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1),   &
                                   uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1),        &
                                   ONE, a_mat(l_row_beg,l_col_beg), lda)
788
              call timer%stop("blas")
789
            endif !useGPU
Pavel Kus's avatar
Pavel Kus committed
790
791
          enddo

792
          n_stored_vecs = 0
Pavel Kus's avatar
Pavel Kus committed
793
794
795

        endif

796
        if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
797
          if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
798
            !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
799
             a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
800

Pavel Kus's avatar
Pavel Kus committed
801
             successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
802
                                       1 *  size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
803
             check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
804

805
          endif
806
          if (n_stored_vecs > 0) then
Pavel Kus's avatar
Pavel Kus committed
807
            a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
808
809
                        + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
          end if
Pavel Kus's avatar
Pavel Kus committed
810
          d_vec(istep-1) = a_mat(l_rows,l_cols)
811

812
          if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
813
            !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
814
            !successCUDA = cuda_threadsynchronize()
Andreas Marek's avatar
Andreas Marek committed
815
816
817
            !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)

             successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_size_t), &
Andreas Marek's avatar
Andreas Marek committed
818
                                       int(1 * size_of_datatype, kind=c_size_t), cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
819
             check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
820
          endif
Pavel Kus's avatar
Pavel Kus committed
821
822
823
824
        endif

      enddo ! main cycle over istep=na,3,-1

825
826
827
828
829
830
831
#if COMPLEXCASE == 1
      ! Store e_vec(1) and d_vec(1)

      if (my_pcol==pcol(2, nblk, np_cols)) then
        if (my_prow==prow(1, nblk, np_rows)) then
          ! We use last l_cols value of loop above
          if(useGPU) then
832
833
            successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
834
            check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
835
836
837
838
            vrl = aux3(1)
          else !useGPU
            vrl = a_mat(1,l_cols)
          endif !useGPU
839
840
841
842
843
844
845
846
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
&PRECISION &
	                     (vrl, CONST_REAL_0_0, xf, tau(2))
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
          e_vec(1) = vrl

          a_mat(1,l_cols) = 1. ! for consistency only
        endif
#ifdef WITH_MPI
        call timer%start("mpi_communication")
        call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
        call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      endif

#ifdef WITH_MPI
      call timer%start("mpi_communication")
      call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
      call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols))  then
        if(useGPU) then
          successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
868
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
869
          check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
870
871
872
873
874
875
876
877
878
          d_vec(1) = PRECISION_REAL(aux3(1))
        else !useGPU
          d_vec(1) = PRECISION_REAL(a_mat(1,1))
        endif !useGPU
      endif

#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
879
      ! Store e_vec(1)
880

881
      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
882
        if(useGPU) then
883
884
          successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
885
          check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
886
        else !useGPU
Pavel Kus's avatar
Pavel Kus committed
887
          e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
888
889
        endif !useGPU
      endif
890

Pavel Kus's avatar
Pavel Kus committed
891
     ! Store d_vec(1)
892
      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
893
        if(useGPU) then
894
          successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
895
          check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
896
        else !useGPU
Pavel Kus's avatar
Pavel Kus committed
897
          d_vec(1) = a_mat(1,1)
898
        endif !useGPU
899
      endif
900
#endif
Pavel Kus's avatar
Pavel Kus committed
901

902
      deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
Pavel Kus's avatar
Pavel Kus committed
903
      if (istat .ne. 0) then
904
#if REALCASE == 1
905
        print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
906
907
908
909
#endif
#if COMPLEXCASE == 1
        print *,"tridiag_complex: error when deallocating tmp "//errorMessage
#endif
Andreas Marek's avatar
Andreas Marek committed
910
        stop 1
Pavel Kus's avatar
Pavel Kus committed
911
      endif
912

913
      if (useGPU) then
914
915
        ! todo: should we leave a_mat on the device for further use?
        successCUDA = cuda_free(a_dev)
Andreas Marek's avatar
Andreas Marek committed
916
        check_dealloc_cuda("tridiag: a_dev 9", successCUDA)
917

918
        successCUDA = cuda_free(v_row_dev)
Andreas Marek's avatar
Andreas Marek committed
919
        check_dealloc_cuda("tridiag: v_row_dev", successCUDA)
920

921
        successCUDA = cuda_free(u_row_dev)
Andreas Marek's avatar
Andreas Marek committed
922
        check_dealloc_cuda("tridiag: (u_row_dev", successCUDA)
923

924
        successCUDA = cuda_free(v_col_dev)
Andreas Marek's avatar
Andreas Marek committed
925
        check_dealloc_cuda("tridiag: v_col_dev", successCUDA)
926

927
        successCUDA = cuda_free(u_col_dev)
Andreas Marek's avatar
Andreas Marek committed
928
        check_dealloc_cuda("tridiag: u_col_dev ", successCUDA)
929

930
        successCUDA = cuda_free(vu_stored_rows_dev)
Andreas Marek's avatar
Andreas Marek committed
931
        check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successCUDA)
932

933
        successCUDA = cuda_free(uv_stored_cols_dev)
Andreas Marek's avatar
Andreas Marek committed
934
        check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successCUDA)
935
936
      endif

Pavel Kus's avatar
Pavel Kus committed
937
      ! distribute the arrays d_vec and e_vec to all processors
Pavel Kus's avatar
Pavel Kus committed
938

939
#if REALCASE == 1
940
      allocate(tmp(na), stat=istat, errmsg=errorMessage)
941

Pavel Kus's avatar
Pavel Kus committed
942
943
      if (istat .ne. 0) then
        print *,"tridiag_real: error when allocating tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
944
        stop 1
Pavel Kus's avatar
Pavel Kus committed
945
      endif
946
947
948
949
950
#endif
#if COMPLEXCASE == 1
      allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
951
        stop 1
952
953
954
955
      endif
#endif


Pavel Kus's avatar
Pavel Kus committed
956
957
#ifdef WITH_MPI
      call timer%start("mpi_communication")
958
#if REALCASE == 1