#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
#else
#define SAVE_MATR(name, iteration)
#endif
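! SAVE_MATR dumps the distributed matrix for debugging via prmat (apparently from the
! matrix_plot module imported below); it expands to nothing unless DOUBLE_PRECISION_REAL
! is defined.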

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param obj         object of elpa_type
!>
!> \param na          Order of matrix
!>
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
!>              Distribution is like in Scalapack.
!>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half).
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI communicator for rows
!> \param mpi_comm_cols        MPI communicator for columns
!>
!> \param d_vec(na)   Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)   Off-diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true, the GPU version of the subroutine will be used
!> \param wantDebug   If true, more debug information is printed
!>
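!> Note: on exit the routine has computed the factorization a = Q * T * Q**T
!> (Q**H in the complex case), where T is the tridiagonal matrix given by d_vec
!> and e_vec and Q is defined by the Householder vectors left in a_mat and tau.
!>
!> A hypothetical call for the real double-precision instantiation of this
!> template (the actual name depends on the preprocessor configuration):
!>   call tridiag_real_double(obj, na, a, lda, nblk, matrixCols, &
!>        mpi_comm_rows, mpi_comm_cols, d, e, tau, .false., .false., 1)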
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
      use cuda_functions
      use iso_c_binding
      use precision
      use elpa_abstract_impl
      use matrix_plot
      use elpa_omp
      use elpa_blas_interfaces

      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout)    :: obj
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU, wantDebug
      integer(kind=c_int)                           :: skewsymmetric
      logical                                       :: isSkewsymmetric

      MATH_DATATYPE(kind=rck), intent(out)          :: tau(na)
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,*)
#else
      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,matrixCols)
#endif
      real(kind=rk), intent(out)                    :: d_vec(na)
      real(kind=rk), intent(out)                    :: e_vec(na)
      integer(kind=ik), parameter                   :: max_stored_uv = 32
      logical,          parameter                   :: mat_vec_as_one_block = .true.

      ! id in processor row and column and total numbers of processor rows and columns
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)                        :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=MPI_KIND)                        :: mpierr
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
      ! updated after each istep (in the main cycle) to contain number of
      ! local columns and rows of the remaining part of the matrix
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs

      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA

      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_intptr_t)                      :: a_offset

      integer(kind=ik), intent(in)                  :: max_threads
#ifdef WITH_OPENMP
      integer(kind=ik)                              :: my_thread, n_threads, n_iter
#endif

      real(kind=rk)                                 :: vnorm2
      MATH_DATATYPE(kind=rck)                       :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
      complex(kind=rck)                             :: aux3(1)
#endif

      integer(kind=c_intptr_t)                      :: num
      MATH_DATATYPE(kind=rck), allocatable          :: tmp(:)
      MATH_DATATYPE(kind=rck), pointer              :: v_row(:), & ! used to store the calculated Householder vector
                                                       v_col(:)   ! the same vector, but transposed
      MATH_DATATYPE(kind=rck), pointer              :: u_col(:), u_row(:)

      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored; then the matrix A_i is explicitly updated
      ! u and v are stored in both row and column forms
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      MATH_DATATYPE(kind=rck), pointer             :: vu_stored_rows(:,:)
      ! pattern: u1,v1,u2,v2,u3,v3,....
      MATH_DATATYPE(kind=rck), allocatable         :: uv_stored_cols(:,:)

#ifdef WITH_OPENMP
      MATH_DATATYPE(kind=rck), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif

      type(c_ptr)                                   :: v_row_host, v_col_host
      type(c_ptr)                                   :: u_row_host, u_col_host
      type(c_ptr)                                   :: vu_stored_rows_host, uv_stored_cols_host
      real(kind=rk), allocatable                    :: tmp_real(:)
      integer(kind=ik)                              :: min_tile_size, error
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
      character(20)                                 :: gpuString
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
      call obj%get("is_skewsymmetric",skewsymmetric,istat)
      if (istat .ne. ELPA_OK) then
           print *,"Problem getting option for skewsymmetric settings. Aborting..."
           stop
      endif
      isSkewsymmetric = (skewsymmetric == 1)

      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

      call obj%timer%start("tridiag_&
      &MATH_DATATYPE&
      &" // &
      PRECISION_SUFFIX // &
      gpuString )

      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)

      my_prow = int(my_prowMPI, kind=c_int)
      np_rows = int(np_rowsMPI, kind=c_int)
      my_pcol = int(my_pcolMPI, kind=c_int)
      np_cols = int(np_colsMPI, kind=c_int)
      if (wantDebug) call obj%timer%stop("mpi_communication")

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      ! a tile is a square submatrix consisting of several blocks:
      ! it is the smallest possible square submatrix in which the blocks distributed among
      ! the processors are "aligned" in both rows and columns
      !  -----------------
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile; each number marks the block assigned to the processor with that number
      ! the size of each small block is nblk
      ! the picture shows the situation with 6 processors: 3 processor rows and 2 processor columns
      ! tile_size is thus nblk * 6
      !
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

      ! make tile_size the smallest possible multiple of the previously defined tile size that is
      ! larger than or equal to min_tile_size
      ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so that is now the implicit default;
      ! it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size ,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option for min_tile_size. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
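      ! worked example: with nblk = 16, np_rows = 3, np_cols = 2 the minimum global
      ! tile size is 16 * lcm(3,2) = 96; with the default min_tile_size = 128 * 3 = 384
      ! the rounding above gives tile_size = ((384-1)/96 + 1) * 96 = 384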

      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile

      totalblocks = (na-1)/nblk + 1
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1

      ! locally owned submatrix has size at most max_local_rows x max_local_cols at each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk

      ! allocate memory for vectors
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)

      ! v_row is allocated 1 element longer, to allow storing and broadcasting tau together with it
        allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
        &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

      if (useGPU) then
        num = (max_local_rows+1) * size_of_datatype
        successCUDA = cuda_malloc_host(v_row_host,num)
        check_alloc_cuda("tridiag: v_row_host", successCUDA)

        ! note: c_f_pointer expects the shape in elements, not in bytes
        call c_f_pointer(v_row_host,v_row,(/max_local_rows+1/))

        num = (max_local_cols) * size_of_datatype
        successCUDA = cuda_malloc_host(v_col_host,num)
        check_alloc_cuda("tridiag: v_col_host", successCUDA)

        call c_f_pointer(v_col_host,v_col,(/max_local_cols/))

        num = (max_local_cols) * size_of_datatype
        successCUDA = cuda_malloc_host(u_col_host,num)
        check_alloc_cuda("tridiag: u_col_host", successCUDA)

        call c_f_pointer(u_col_host,u_col,(/max_local_cols/))

        num = (max_local_rows) * size_of_datatype
        successCUDA = cuda_malloc_host(u_row_host,num)
        check_alloc_cuda("tridiag: u_row_host", successCUDA)

        call c_f_pointer(u_row_host,u_row,(/max_local_rows/))

        num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
        successCUDA = cuda_malloc_host(vu_stored_rows_host,num)
        check_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA)

        call c_f_pointer(vu_stored_rows_host,vu_stored_rows,(/max_local_rows,2*max_stored_uv/))

        num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
        !successCUDA = cuda_malloc_host(uv_stored_cols_host,num)
        !check_alloc_cuda("tridiag: uv_stored_cols_host", successCUDA)

        !call c_f_pointer(uv_stored_cols_host,uv_stored_cols,(/max_local_cols,2*max_stored_uv/))

        successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)

#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
        num = na * 8
#else
        num = na * 4
#endif
        successCUDA = cuda_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("tridiag: e_vec", successCUDA)

#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
        num = na * 8
#else
        num = na * 4
#endif
        successCUDA = cuda_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("tridiag: d_vec", successCUDA)

      else
        allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
        &MATH_DATATYPE ", "v_row", istat, errorMessage)

        allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
         &MATH_DATATYPE ", "v_col", istat, errorMessage)

        allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
        &MATH_DATATYPE ", "u_col", istat, errorMessage)

        allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
        &MATH_DATATYPE ", "u_row", istat, errorMessage)

        allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
        call check_alloc("tridiag_&
        &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
        
      endif

#ifdef WITH_OPENMP
      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */

      tmp = 0
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0

      if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)

         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: u_row_dev", successCUDA)

         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)

         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)

         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif !useGPU

      d_vec(:) = 0
      e_vec(:) = 0
      tau(:) = 0

      n_stored_vecs = 0

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat

      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
        d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
        d_vec(na) = a_mat(l_rows,l_cols)
#endif

      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix

        num = lda * matrixCols * size_of_datatype

        successCUDA = cuda_malloc(a_dev, num)
        check_alloc_cuda("tridiag: a_dev", successCUDA)

        successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("tridiag: a_mat", successCUDA)

        successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
                                  num, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif

      ! main cycle of tridiagonalization
      ! in each step, one Householder vector is calculated
      do istep = na, 3, -1
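        ! Outline of one iteration (following the comments below): a Householder
        ! vector v for column istep is computed and stored, u = (A + VU**T + UV**T)*v
        ! is formed, and the new pair (v,u) is appended to vu_stored_rows /
        ! uv_stored_cols; A itself is updated explicitly only once max_stored_uv
        ! pairs have accumulated, so most rank-2 updates are deferred and batched.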

        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

        ! Calculate the vector for the Householder transformation on all procs
        ! owning column istep

        if (my_pcol == pcol(istep, nblk, np_cols)) then

          ! Get the vector to be transformed; distribute its last element and the norm of
          ! the remaining elements to all procs in the current column

          ! copy the (l_cols+1)-th local column of A to v_row
          if (useGPU) then
            a_offset = l_cols * lda * size_of_datatype
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
            ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

            successCUDA = cuda_memcpy(int(loc(v_row),kind=c_intptr_t), &
                                      a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif

          if (n_stored_vecs > 0 .and. l_rows > 0) then
            if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
            aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
            call PRECISION_GEMV('N',   &
                                  int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
                                  ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
#if REALCASE == 1
                                  uv_stored_cols(l_cols+1,1), &
                                          int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
#endif
#if COMPLEXCASE == 1
                                   aux, 1_BLAS_KIND,  &
#endif
                                  ONE, v_row, 1_BLAS_KIND)
            if (wantDebug) call obj%timer%stop("blas")

          endif

          if (my_prow == prow(istep-1, nblk, np_rows)) then
            aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
            aux1(2) = v_row(l_rows)
          else
            aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
            aux1(2) = 0.
          endif

#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          aux2 = aux1
#endif /* WITH_MPI */

#if REALCASE == 1
          vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
          vnorm2 = real(aux2(1),kind=rk)
#endif
          vrl    = aux2(2)

          ! Householder transformation
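          ! as used here, hh_transform takes the pivot element vrl and the squared
          ! norm vnorm2 of the remaining entries and returns tau(istep) plus a
          ! scaling factor xf for v_row; on exit vrl holds the new off-diagonal
          ! element (stored into e_vec below)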
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
               &PRECISION &
                             (obj, vrl, vnorm2, xf, tau(istep), wantDebug)

          ! Scale v_row and store the Householder vector for back transformation

          v_row(1:l_rows) = v_row(1:l_rows) * xf
          if (my_prow == prow(istep-1, nblk, np_rows)) then
            v_row(l_rows) = 1.

            ! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
#if REALCASE == 1
            e_vec(istep-1) = vrl
#endif
#if COMPLEXCASE == 1
            e_vec(istep-1) = real(vrl,kind=rk)
#endif
          endif

          ! store the Householder vector for back transformation
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

          ! append tau after the end of the actual v_row, to be broadcast together with it
          v_row(l_rows+1) = tau(istep)
         endif !(my_pcol == pcol(istep, nblk, np_cols))

!          SAVE_MATR("HH vec stored", na - istep + 1)

#ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         ! Broadcast the Householder vector (and tau) along columns
         call MPI_Bcast(v_row, int(l_rows+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                        int(pcol(istep, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

         ! recover tau, which has been broadcast together with v_row
         tau(istep) =  v_row(l_rows+1)

         ! Transpose the Householder vector v_row -> v_col
         call elpa_transpose_vectors_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                   (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
                    1, istep-1, 1, nblk, max_threads)

         ! Calculate u = (A + VU**T + UV**T)*v

         ! For cache efficiency, we use only the upper half of the matrix tiles for this,
         ! thus the result is partly in u_col(:) and partly in u_row(:)
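         ! (tiles strictly below the diagonal are never touched directly: because A is
         ! symmetric, the transposed GEMV on an upper tile also yields the contribution
         ! of its mirrored lower tile, which is accumulated into u_row)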

         u_col(1:l_cols) = 0
         u_row(1:l_rows) = 0
         if (l_rows > 0 .and. l_cols > 0) then
          if (useGPU) then
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

            successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
                          l_cols * size_of_datatype, cudaMemcpyHostToDevice)

            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

            successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
                                      l_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
          call obj%timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

          my_thread = omp_get_thread_num()
          
          n_threads = omp_get_num_threads()

          n_iter = 0

          ! first calculate A*v part of (A + VU**T + UV**T)*v
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
          do i = 0, (istep-2)/tile_size
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
              if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
              if (mod(n_iter,n_threads) == my_thread) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                    ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND),         &
                                    v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)

                if (i/=j) then
                  if (isSkewsymmetric) then
                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                        -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND,  &
                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)

                  else
                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                        ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND,  &
                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                  endif
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif
              n_iter = n_iter+1
#else /* WITH_OPENMP */

              ! multiplication by blocks is efficient only for CPU
              ! for GPU we introduced 2 other ways: either by stripes (more similar to the original
              ! CPU implementation) or by one large matrix-vector multiply
              if (.not. useGPU) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                            ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND),         &
                            v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND,                           &
                            ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)

                if (i/=j) then
                  if (isSkewsymmetric) then
                    call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND),  &
                                        -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND),               &
                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
                                        1_BLAS_KIND)

                  else
                    call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND),  &
                                        ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND),               &
                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
                                        1_BLAS_KIND)
                  endif
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif ! not useGPU

#endif /* WITH_OPENMP */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if (mat_vec_as_one_block) then
              ! Unlike for CPU, we (for each MPI task) do just one large mat-vec multiplication here;
              ! this requires altering the algorithm when later explicitly updating the matrix
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              if (wantDebug) call obj%timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, lda,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, lda,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
              if (wantDebug) call obj%timer%stop("cublas")

            else
              ! perform the multiplication by stripes - it is faster than by blocks, since we call cublas with
              ! larger matrices. In general, however, this algorithm is very similar to the CPU one
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype

                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo

              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype
                  if (isSkewsymmetric) then
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 -ONE, a_dev + a_offset, lda, &
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  else
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 ONE, a_dev + a_offset, lda, &
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  endif
              enddo
            end if !multiplication as one block / per stripes

            successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
                          u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)

            successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
                          u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
747
          endif

!              call PRECISION_SYMV('U', l_cols,  &
!                         1.d0, a_mat, ubound(a_mat,1),  &
!                         v_row, 1,  &
!                         0.d0, u_col, 1)

!            endif ! useGPU

#ifdef WITH_OPENMP
!$OMP END PARALLEL
          call obj%timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */

          ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
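          ! two small GEMVs: first aux = vu_stored_rows**T * v_row (length
          ! 2*n_stored_vecs), then u_col = u_col + uv_stored_cols * aux; thanks to the
          ! interleaved storage patterns (v1,u1,... and u1,v1,...) this adds exactly
          ! (VU**T + UV**T)*v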
          if (n_stored_vecs > 0) then
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C',     &
#endif
                                int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                                ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND),   &
                                v_row,  1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)

            call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                                ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND),   &
                                aux, 1_BLAS_KIND, ONE, u_col,  1_BLAS_KIND)
            if (wantDebug) call obj%timer%stop("blas")
          endif
        endif  ! (l_rows>0 .and. l_cols>0)
        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
        ! on the processors containing the diagonal
        ! This is only necessary if u_row has been calculated, i.e. if the
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then

          call elpa_reduce_add_vectors_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
           mpi_comm_cols, istep-1, 1, nblk, max_threads)

        endif

        ! Sum up all the u_col(:) parts, transpose u_col -> u_row
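        ! (the skew-symmetric case uses the separate elpa_transpose_vectors_ss variant
        ! below, presumably to account for the sign flip implied by A**T = -A)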

        if (l_cols>0) then
          tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                             MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          u_col = tmp
#endif /* WITH_MPI */
        endif
        if (isSkewsymmetric) then
           call elpa_transpose_vectors_ss_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
           (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
            mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
        else
           call elpa_transpose_vectors_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
           (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
            mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
        endif

        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
        x = 0
        if (l_cols>0)  &
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))

#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")