#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration)
#else
#define SAVE_MATR(name, iteration)
#endif

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param obj	      object of elpa_type
!> \param na          Order of matrix
!>
!> \param a_mat(matrixRows,matrixCols)    Distributed matrix which should be reduced.
!>              Distribution is like in Scalapack.
!>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param matrixRows         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
!> \param wantDebug   if true, prints more debug information
!>
subroutine tridiag_&
  &MATH_DATATYPE&
  &_&
  &PRECISION &
  (obj, na, a_mat, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
  use cuda_functions
  use, intrinsic :: iso_c_binding
  use precision
  use elpa_abstract_impl
  use matrix_plot
  use elpa_omp
  use elpa_blas_interfaces

  implicit none
#include "../general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout)    :: obj
  integer(kind=ik), intent(in)                  :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
  logical, intent(in)                           :: useGPU, wantDebug
  integer(kind=c_int)                           :: skewsymmetric
  logical                                       :: isSkewsymmetric

  MATH_DATATYPE(kind=rck), intent(out)          :: tau(na)
#ifdef USE_ASSUMED_SIZE
  MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,*)
#else
  MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,matrixCols)
#endif
  real(kind=rk), intent(out)                    :: d_vec(na)
  real(kind=rk), intent(out)                    :: e_vec(na)
  integer(kind=ik), parameter                   :: max_stored_uv = 32
  logical,          parameter                   :: mat_vec_as_one_block = .true.

  ! id in processor row and column and total numbers of processor rows and columns
  integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND)                        :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=MPI_KIND)                        :: mpierr
  integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                   max_local_cols
  ! updated after each istep (in the main cycle) to contain number of
  ! local columns and rows of the remaining part of the matrix
  !integer(kind=ik)                             :: l_cols, l_rows
  integer(kind=ik)                              :: l_cols, l_rows
  integer(kind=ik)                              :: n_stored_vecs

  integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                   uv_stored_cols_dev
  logical                                       :: successCUDA

  integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
  integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
  integer(kind=c_intptr_t)                      :: a_offset

  integer(kind=ik), intent(in)                  :: max_threads
#ifdef WITH_OPENMP_TRADITIONAL
  integer(kind=ik)                              :: my_thread, n_threads, n_iter
#endif

  real(kind=rk)                                 :: vnorm2
  MATH_DATATYPE(kind=rck)                       :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
  complex(kind=rck)                             :: aux3(1)
#endif

  integer(kind=c_intptr_t)                      :: num
  MATH_DATATYPE(kind=rck), allocatable          :: tmp(:)
  MATH_DATATYPE(kind=rck), pointer              :: v_row(:), & ! used to store calculated Householder Vector
                                                   v_col(:)   ! the same Vector, but transposed 
  MATH_DATATYPE(kind=rck), pointer              :: u_col(:), u_row(:)

  ! the following two matrices store pairs of vectors v and u calculated in each step
  ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
  ! u and v are stored both in row and column form
  ! pattern: v1,u1,v2,u2,v3,u3,....
  ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
  MATH_DATATYPE(kind=rck), pointer             :: vu_stored_rows(:,:)
  ! pattern: u1,v1,u2,v2,u3,v3,....
  MATH_DATATYPE(kind=rck), allocatable         :: uv_stored_cols(:,:)
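  ! for example: after k Householder steps (n_stored_vecs = k), columns 2k-1 and 2k
  ! hold the most recent (v,u) pair; once n_stored_vecs reaches max_stored_uv,
  ! the accumulated rank-2k update is applied to the matrix with a single GEMM
  ! and the counter is reset (see the main cycle below)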

#ifdef WITH_OPENMP_TRADITIONAL
  MATH_DATATYPE(kind=rck), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif

  type(c_ptr)                                   :: v_row_host, v_col_host
  type(c_ptr)                                   :: u_row_host, u_col_host
  type(c_ptr)                                   :: vu_stored_rows_host, uv_stored_cols_host
  real(kind=rk), allocatable                    :: tmp_real(:)
  integer(kind=ik)                              :: min_tile_size, error
  integer(kind=ik)                              :: istat
  character(200)                                :: errorMessage
  character(20)                                 :: gpuString
  integer(kind=ik)                              :: nblockEnd
  integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                      &PRECISION&
                                                                      &_&
                                                                      &MATH_DATATYPE
  call obj%get("is_skewsymmetric",skewsymmetric,istat)
  if (istat .ne. ELPA_OK) then
       print *,"Problem getting option for skewsymmetric settings. Aborting..."
       stop
  endif
  isSkewsymmetric = (skewsymmetric == 1)

  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("tridiag_&
  &MATH_DATATYPE&
  &" // &
  PRECISION_SUFFIX // &
  gpuString )

  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)

  my_prow = int(my_prowMPI, kind=c_int)
  np_rows = int(np_rowsMPI, kind=c_int)
  my_pcol = int(my_pcolMPI, kind=c_int)
  np_cols = int(np_colsMPI, kind=c_int)
  if (wantDebug) call obj%timer%stop("mpi_communication")

  ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
  ! a tile is a square submatrix consisting of several blocks:
  ! the smallest possible square submatrix in which the blocks distributed among
  ! the processors are "aligned" in both rows and columns
  !  -----------------
  ! | 1 4 | 1 4 | 1 4 | ...
  ! | 2 5 | 2 5 | 2 5 | ...
  ! | 3 6 | 3 6 | 3 6 | ...
  !  ----------------- ...
  ! | 1 4 | 1 4 | 1 4 | ...
  ! | 2 5 | 2 5 | 2 5 | ...
  ! | 3 6 | 3 6 | 3 6 | ...
  !  ----------------- .
  !   : :   : :   : :    .
  !   : :   : :   : :      .
  !
  ! this is a tile, where each number represents a block assigned to the processor with that number
  ! the size of each small block is nblk
  ! the picture shows the situation with 6 processors, 3 processor rows and 2 processor columns
  ! tile_size is thus nblk * 6
  !
  tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

  ! round tile_size up to the smallest multiple of the previously defined tile size that is
  ! larger than or equal to min_tile_size
  ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), which is now the implicit default
  ! it can, however, be set by the user
  call obj%get("min_tile_size", min_tile_size ,error)
  if (error .ne. ELPA_OK) then
    print *,"Problem setting option for min_tile_size. Aborting..."
    stop
  endif
  if(min_tile_size == 0) then
    ! not set by the user, use the default value
    min_tile_size = 128*max(np_rows, np_cols)
  endif
  tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
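  ! worked example: with nblk = 32, np_rows = 4, np_cols = 2 the minimal global tile
  ! size is 32 * lcm(4,2) = 128; with the default min_tile_size = 128*4 = 512 the
  ! rounding above gives tile_size = ((512-1)/128+1)*128 = 512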

  nblockEnd = 3

  l_rows_per_tile = tile_size/np_rows ! local rows of a tile
  l_cols_per_tile = tile_size/np_cols ! local cols of a tile

  totalblocks = (na-1)/nblk + 1
  max_loc_block_rows = (totalblocks-1)/np_rows + 1
  max_loc_block_cols = (totalblocks-1)/np_cols + 1

  ! the locally owned submatrix has size at most max_local_rows x max_local_cols at each processor
  max_local_rows = max_loc_block_rows*nblk
  max_local_cols = max_loc_block_cols*nblk
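  ! e.g. for na = 1000, nblk = 32, np_rows = 4: totalblocks = 999/32+1 = 32,
  ! max_loc_block_rows = 31/4+1 = 8, hence max_local_rows = 8*32 = 256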

  ! allocate memory for vectors
  ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
  ! todo: if something has length max_local_rows, it is actually a column, no?
  ! todo: probably one should read it as v_row = vector v distributed among rows
  !
  allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
  call check_alloc("tridiag_&
  &MATH_DATATYPE ", "tmp", istat, errorMessage)

  ! allocate v_row one element longer, so that tau can be stored and broadcast together with it
  allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
  call check_alloc("tridiag_&
       &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

  allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
  call check_alloc("tridiag_&
       &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

  if (useGPU) then
    num = (max_local_rows+1) * size_of_datatype
    successCUDA = cuda_malloc_host(v_row_host,num)
    check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
    call c_f_pointer(v_row_host,v_row,(/(max_local_rows+1)/))

    num = (max_local_cols) * size_of_datatype
    successCUDA = cuda_malloc_host(v_col_host,num)
    check_host_alloc_cuda("tridiag: v_col_host", successCUDA)
    call c_f_pointer(v_col_host,v_col,(/(max_local_cols)/))

    num = (max_local_cols) * size_of_datatype
    successCUDA = cuda_malloc_host(u_col_host,num)
    check_host_alloc_cuda("tridiag: u_col_host", successCUDA)
    call c_f_pointer(u_col_host,u_col,(/(max_local_cols)/))

    num = (max_local_rows) * size_of_datatype
    successCUDA = cuda_malloc_host(u_row_host,num)
    check_host_alloc_cuda("tridiag: u_row_host", successCUDA)
    call c_f_pointer(u_row_host,u_row,(/(max_local_rows)/))

    num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
    successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
                  cudaHostRegisterDefault)
    check_host_register_cuda("tridiag: vu_stored_roes", successCUDA)

    num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
    successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
                  cudaHostRegisterDefault)
    check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)

#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
    num = na * 8
#else
    num = na * 4
#endif
    successCUDA = cuda_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
    check_host_register_cuda("tridiag: e_vec", successCUDA)

#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
    num = na * 8
#else
    num = na * 4
#endif
    successCUDA = cuda_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
                      cudaHostRegisterDefault)
    check_host_register_cuda("tridiag: d_vec", successCUDA)

  else
    allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
    call check_alloc("tridiag_&
    &MATH_DATATYPE ", "v_row", istat, errorMessage)

    allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
    call check_alloc("tridiag_&
     &MATH_DATATYPE ", "v_col", istat, errorMessage)

    allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
    call check_alloc("tridiag_&
    &MATH_DATATYPE ", "u_col", istat, errorMessage)

    allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
    call check_alloc("tridiag_&
    &MATH_DATATYPE ", "u_row", istat, errorMessage)
      
  endif

#ifdef WITH_OPENMP_TRADITIONAL
  allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
  call check_alloc("tridiag_&
  &MATH_DATATYPE ", "ur_p", istat, errorMessage)

  allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
  call check_alloc("tridiag_&
  &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP_TRADITIONAL */

  tmp = 0
  v_row = 0
  u_row = 0
  v_col = 0
  u_col = 0

  if (useGPU) then
     successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
     check_alloc_cuda("tridiag: v_row_dev", successCUDA)

     successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)

     check_alloc_cuda("tridiag: u_row_dev", successCUDA)

     successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
     check_alloc_cuda("tridiag: v_col_dev", successCUDA)

     successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
     check_alloc_cuda("tridiag: u_col_dev", successCUDA)

     successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
     check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

     successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
     check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
  endif !useGPU


  d_vec(:) = 0
  e_vec(:) = 0
  tau(:) = 0

  n_stored_vecs = 0

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat

  if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
  d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
  d_vec(na) = a_mat(l_rows,l_cols)
#endif

  if (useGPU) then
    ! allocate memory for matrix A on the device and then copy the matrix

    num = matrixRows * matrixCols * size_of_datatype

    successCUDA = cuda_malloc(a_dev, num)
    check_alloc_cuda("tridiag: a_dev", successCUDA)

    successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t),num,&
                  cudaHostRegisterDefault)
    check_host_register_cuda("tridiag: a_mat", successCUDA)

    successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
                              num, cudaMemcpyHostToDevice)
    check_memcpy_cuda("tridiag: a_dev", successCUDA)
  endif

  ! main cycle of tridiagonalization
  ! in each step, 1 Householder Vector is calculated
  do istep = na, nblockEnd ,-1

    ! Calculate number of local rows and columns of the still remaining matrix
    ! on the local processor
    l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
    l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

    ! Calculate Vector for Householder transformation on all procs
    ! owning column istep

    if (my_pcol == pcol(istep, nblk, np_cols)) then

      ! Get Vector to be transformed; distribute last element and norm of
      ! remaining elements to all procs in current column

      ! copy column l_cols+1 of A to v_row
      if (useGPU) then
        a_offset = l_cols * matrixRows * size_of_datatype
        ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, 
        ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

        successCUDA = cuda_memcpy(int(loc(v_row),kind=c_intptr_t), &
                                  a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("tridiag a_dev 1", successCUDA)
      else
        v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
      endif

      if (n_stored_vecs > 0 .and. l_rows > 0) then
        if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
        aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
        call PRECISION_GEMV('N',   &
                            int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
                            ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
#if REALCASE == 1
                            uv_stored_cols(l_cols+1,1), &
                            int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
#endif
#if COMPLEXCASE == 1
                            aux, 1_BLAS_KIND,  &
#endif
                            ONE, v_row, 1_BLAS_KIND)
        if (wantDebug) call obj%timer%stop("blas")

      endif

      if (my_prow == prow(istep-1, nblk, np_rows)) then
        aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
        aux1(2) = v_row(l_rows)
      else
        aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
        aux1(2) = 0.
      endif

#ifdef WITH_MPI
      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                           MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      aux2 = aux1
#endif /* WITH_MPI */

#if REALCASE == 1
      vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
      vnorm2 = real(aux2(1),kind=rk)
#endif
      vrl    = aux2(2)

      ! Householder transformation
#if REALCASE == 1
      call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
      call hh_transform_complex_&
#endif
               &PRECISION &
               (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
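      ! on exit vrl contains the value that becomes the new off-diagonal element
      ! e_vec(istep-1) below, and xf the factor by which v_row is rescaled so
      ! that its last relevant element becomes 1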
      ! Scale v_row and store Householder Vector for back transformation

      v_row(1:l_rows) = v_row(1:l_rows) * xf
      if (my_prow == prow(istep-1, nblk, np_rows)) then
        v_row(l_rows) = 1.

        ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
#if REALCASE == 1
        e_vec(istep-1) = vrl
#endif
#if COMPLEXCASE == 1
        e_vec(istep-1) = real(vrl,kind=rk)
#endif
      endif

      ! store Householder Vector for back transformation
      a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

      ! add tau after the end of the actual v_row, to be broadcast together with it
      v_row(l_rows+1) = tau(istep)
    endif !(my_pcol == pcol(istep, nblk, np_cols))

!          SAVE_MATR("HH vec stored", na - istep + 1)

#ifdef WITH_MPI
    if (wantDebug) call obj%timer%start("mpi_communication")
    ! Broadcast the Householder Vector (and tau) along columns
    call MPI_Bcast(v_row, int(l_rows+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                   int(pcol(istep, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
    if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

    ! recover tau, which has been broadcast together with v_row
    tau(istep) =  v_row(l_rows+1)

    ! Transpose Householder Vector v_row -> v_col
    call elpa_transpose_vectors_&
        &MATH_DATATYPE&
        &_&
        &PRECISION &
              (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
               1, istep-1, 1, nblk, max_threads)

    ! Calculate u = (A + VU**T + UV**T)*v
Pavel Kus's avatar
Pavel Kus committed
558

    ! For cache efficiency, we use only the upper half of the matrix tiles for this,
    ! thus the result is partly in u_col(:) and partly in u_row(:)
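    ! sketch of the tile loop below: for every tile (j,i) on or above the diagonal,
    ! (A_tile)**T * v is accumulated into u_col; for strictly upper tiles (i /= j)
    ! A_tile * v is additionally accumulated into u_row (with a minus sign in the
    ! skewsymmetric case), so the lower half of the matrix is never read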
Pavel Kus's avatar
Pavel Kus committed
561

    u_col(1:l_cols) = 0
    u_row(1:l_rows) = 0
    if (l_rows > 0 .and. l_cols> 0 ) then
     if (useGPU) then
       successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
       check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

       successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
       check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

       successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
                     l_cols * size_of_datatype, cudaMemcpyHostToDevice)

       check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

       successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
                                 l_rows * size_of_datatype, cudaMemcpyHostToDevice)
       check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
     endif ! useGPU

#ifdef WITH_OPENMP_TRADITIONAL
     call obj%timer%start("OpenMP parallel")
!todo : check whether GPU implementation with large matrix multiply is beneficial
!       for a larger number of threads; could be addressed with autotuning if this
!       is the case
!$omp parallel &
!$omp num_threads(max_threads) &
!$omp default(none) &
!$omp private(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) &
!$omp shared(useGPU, isSkewsymmetric, cudaMemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp &      v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp&       max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp&       matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat)
     my_thread = omp_get_thread_num()
     n_threads = omp_get_num_threads()
Pavel Kus's avatar
Pavel Kus committed
598

Andreas Marek's avatar
Andreas Marek committed
599
     n_iter = 0
Pavel Kus's avatar
Pavel Kus committed
600

Andreas Marek's avatar
Andreas Marek committed
601
602
603
     ! first calculate A*v part of (A + VU**T + UV**T)*v
     uc_p(1:l_cols,my_thread) = 0.
     ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP_TRADITIONAL */
     do i= 0, (istep-2)/tile_size
       l_col_beg = i*l_cols_per_tile+1
       l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
       if (l_col_end < l_col_beg) cycle
       do j = 0, i
         l_row_beg = j*l_rows_per_tile+1
         l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
         if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP_TRADITIONAL
         if (mod(n_iter,n_threads) == my_thread) then
           if (wantDebug) call obj%timer%start("blas")
           call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),         &
                v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)

           if (i/=j) then
             if (isSkewsymmetric) then
               call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                   int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                   -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                   v_col(l_col_beg:max_local_cols), 1_BLAS_KIND,  &
                                   ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)

             else
               call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                   int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                   ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                   v_col(l_col_beg:max_local_cols), 1_BLAS_KIND,  &
                                   ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
             endif
           endif
           if (wantDebug) call obj%timer%stop("blas")
         endif
         n_iter = n_iter+1
#else /* WITH_OPENMP_TRADITIONAL */

         ! multiplication by blocks is efficient only for the CPU
         ! for the GPU we introduced 2 other ways, either by stripes (more similar to the original
         ! CPU implementation) or by one large matrix-vector multiply
         if (.not. useGPU) then
           if (wantDebug) call obj%timer%start("blas")
           call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                       int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                       ONE, a_mat(l_row_beg, l_col_beg), int(matrixRows,kind=BLAS_KIND),         &
                       v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND,                           &
                       ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)

           if (i/=j) then
             if (isSkewsymmetric) then
               call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                   -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),               &
                                   v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
                                   1_BLAS_KIND)

             else
               call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND),  &
                                   ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),               &
                                   v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
                                   1_BLAS_KIND)
             endif
           endif
           if (wantDebug) call obj%timer%stop("blas")
         endif ! not useGPU

#endif /* WITH_OPENMP_TRADITIONAL */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if (mat_vec_as_one_block) then
              ! Unlike for the CPU, we (in each MPI task) do just one large mat-vec multiplication
              ! this requires altering the algorithm when the matrix is later explicitly updated
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              if (wantDebug) call obj%timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, matrixRows,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, matrixRows,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
              if (wantDebug) call obj%timer%stop("cublas")

            else  ! mat_vec_as_one_block
              !perform multiplication by stripes - it is faster than by blocks, since we call cublas with
              !larger matrices. In general, however, this algorithm is very similar to the CPU one
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                          size_of_datatype

                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, matrixRows,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo

              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                          size_of_datatype
                  if (isSkewsymmetric) then
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 -ONE, a_dev + a_offset, matrixRows, &
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  else
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 ONE, a_dev + a_offset, matrixRows, &
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  endif
              enddo
            end if !multiplication as one block / per stripes

            successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
                          u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)

            successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
                          u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP_TRADITIONAL
!$OMP END PARALLEL
          call obj%timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
         enddo
#endif /* WITH_OPENMP_TRADITIONAL */

         ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
         if (n_stored_vecs > 0) then
           if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
           call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
           call PRECISION_GEMV('C',     &
#endif
                               int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                               ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND),   &
                               v_row,  1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)

           call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                               ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND),   &
                               aux, 1_BLAS_KIND, ONE, u_col,  1_BLAS_KIND)
           if (wantDebug) call obj%timer%stop("blas")
         endif

       endif  ! (l_rows>0 .and. l_cols>0)

       ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
       ! on the processors containing the diagonal
       ! This is only necessary if u_row has been calculated, i.e. if the
       ! global tile size is smaller than the global remaining matrix

       if (tile_size < istep-1) then

         call elpa_reduce_add_vectors_&
         &MATH_DATATYPE&
         &_&
         &PRECISION &
         (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
         mpi_comm_cols, istep-1, 1, nblk, max_threads)

       endif

       ! Sum up all the u_col(:) parts, transpose u_col -> u_row

       if (l_cols>0) then
         tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                            MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
         u_col = tmp
#endif /* WITH_MPI */
       endif
       if (isSkewsymmetric) then
          call elpa_transpose_vectors_ss_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
           mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
       else
          call elpa_transpose_vectors_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
           mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
       endif

       ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
       x = 0
       if (l_cols>0)  &
       x = dot_product(v_col(1:l_cols),u_col(1:l_cols))

#ifdef WITH_MPI
       if (wantDebug) call obj%timer%start("mpi_communication")
       call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
       if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */

       vav = x

#endif /* WITH_MPI */

       ! store u and v in the matrices U and V
       ! these matrices are stored combined in one here
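       ! what is stored (cf. LAPACK ?sytd2): with u = A*v and vav = v**T * u, the
       ! columns written below encode the symmetric rank-2 update
       !   A <- A - v*w**T - w*v**T ,  where  w = tau*u - 0.5*tau**2*vav*v
       ! (conjugated appropriately in the complex case); it is applied later as
       ! A <- A + vu_stored_rows * uv_stored_cols**T with a single GEMM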

       do j=1,l_rows
#if REALCASE == 1
         vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
         vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
#endif
#if COMPLEXCASE == 1
         vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
         vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
       enddo
       do j=1,l_cols
#if REALCASE == 1
         uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
         uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
#endif
#if COMPLEXCASE == 1
         uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
         uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
       enddo

       ! We have calculated another Householder vector; the number of implicitly stored vectors increases
       n_stored_vecs = n_stored_vecs+1

       ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
       if (n_stored_vecs == max_stored_uv .or. istep == 3) then

         if (useGPU) then
           successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
                                     max_local_rows * 2 * max_stored_uv *          &
                                     size_of_datatype, cudaMemcpyHostToDevice)
           check_memcpy_cuda("tridiag: uv_stored_rows_dev", successCUDA)

           successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
                                     max_local_cols * 2 * max_stored_uv *          &
                                     size_of_datatype, cudaMemcpyHostToDevice)
           check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
         endif

         do i = 0, (istep-2)/tile_size
           ! go over tiles above (or on) the diagonal
           l_col_beg = i*l_cols_per_tile+1
           l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
           l_row_beg = 1
           l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
           if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
           cycle


           if (useGPU) then
             if (.not. mat_vec_as_one_block) then
               ! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
               ! we then use the same calls as in the CPU version
               if (wantDebug) call obj%timer%start("cublas")
               call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
                                         l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
                                         ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
                                         size_of_datatype,  &
                                         max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
                                         size_of_datatype,  &
                                         max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * matrixRows) *     &
                                         size_of_datatype , matrixRows)
               if (wantDebug) call obj%timer%stop("cublas")
             endif
           else !useGPU
             if (wantDebug) call obj%timer%start("blas")
             call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,                &
                                  int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                  int(2*n_stored_vecs,kind=BLAS_KIND),    &
                                  ONE, vu_stored_rows(l_row_beg:max_local_rows,1:2*max_stored_uv), &
                                  int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND),   &
                                  uv_stored_cols(l_col_beg,1), &
                                  int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND),        &
                                  ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND))
             if (wantDebug) call obj%timer%stop("blas")
           endif !useGPU
         enddo

         if (useGPU) then
           if (mat_vec_as_one_block) then
             !update whole (remaining) part of matrix, including tiles below diagonal
             !we can do that in one large cublas call
             if (wantDebug) call obj%timer%start("cublas")
             call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs,   &
                                       ONE, vu_stored_rows_dev, max_local_rows, &
                                       uv_stored_cols_dev, max_local_cols,  &
                                       ONE, a_dev, matrixRows)
             if (wantDebug) call obj%timer%stop("cublas")
           endif
         endif

         n_stored_vecs = 0
       endif

       if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
         if (useGPU) then
           !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
            a_offset = ((l_rows - 1) + matrixRows * (l_cols - 1)) * size_of_datatype

            successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
                                      1 *  size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: a_dev 3", successCUDA)

         endif
         if (n_stored_vecs > 0) then
           a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
                       + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
         end if
#if REALCASE == 1
         if (isSkewsymmetric) then
           d_vec(istep-1) = 0.0_rk
         else
           d_vec(istep-1) = a_mat(l_rows,l_cols)
         endif
#endif
#if COMPLEXCASE == 1
         d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
#endif

         if (useGPU) then
           !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
           !successCUDA = cuda_threadsynchronize()
           !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)

           successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
                                     int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
           check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
         endif
       endif

     enddo ! main cycle over istep=na,3,-1

#if COMPLEXCASE == 1
     ! Store e_vec(1) and d_vec(1)

     if (my_pcol==pcol(2, nblk, np_cols)) then
      if (my_prow==prow(1, nblk, np_rows)) then
       ! We use last l_cols value of loop above
       if (useGPU) then
         successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                                   1 * size_of_datatype, cudaMemcpyDeviceToHost)
         check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
         vrl = aux3(1)
       else !useGPU
         vrl = a_mat(1,l_cols)
       endif !useGPU
       call hh_transform_complex_&
       &PRECISION &
       (obj, vrl, 0.0_rk, xf, tau(2), wantDebug)
#if REALCASE == 1
       e_vec(1) = vrl
#endif
#if COMPLEXCASE == 1
       e_vec(1) = real(vrl,kind=rk)
#endif
       a_mat(1,l_cols) = 1. ! for consistency only
     endif
#ifdef WITH_MPI
     if (wantDebug) call obj%timer%start("mpi_communication")
     call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(prow(1, nblk, np_rows),kind=MPI_KIND), &
                   int(mpi_comm_rows,kind=MPI_KIND), mpierr)
     if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
   endif

#ifdef WITH_MPI
   if (wantDebug) call obj%timer%start("mpi_communication")
   call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(pcol(2, nblk, np_cols),kind=MPI_KIND), &
                  int(mpi_comm_cols,kind=MPI_KIND), mpierr)
   if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */
  if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols))  then
    if (useGPU) then
      successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
                               1 * size_of_datatype, cudaMemcpyDeviceToHost)
      check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
      d_vec(1) = PRECISION_REAL(aux3(1))
    else !useGPU
      d_vec(1) = PRECISION_REAL(a_mat(1,1))
    endif !useGPU
  endif

#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
  ! Store e_vec(1)

  if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
    if (useGPU) then
      successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                                1 * size_of_datatype, cudaMemcpyDeviceToHost)
      check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
    else !useGPU
      e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
    endif !useGPU
  endif

  ! Store d_vec(1)
  if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
    if(useGPU) then
      successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
      check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
    else !useGPU
      if (isSkewsymmetric) then
        d_vec(1) = 0.0_rk
      else
        d_vec(1) = a_mat(1,1)
      endif
    endif !useGPU
  endif
#endif

  deallocate(tmp, stat=istat, errmsg=errorMessage)
  check_deallocate("tridiag: tmp", istat, errorMessage)

  if (useGPU) then
    ! todo: should we leave a_mat on the device for further use?
    successCUDA = cuda_free(a_dev)
    check_dealloc_cuda("tridiag: a_dev 9", successCUDA)