Commit 6dac0853 authored by Pavel Kus's avatar Pavel Kus

tridiag_real from elpa1 ported to the GPU. So far, performance seems to be much worse than on the CPU

Conflicts:
	src/elpa1_auxiliary.F90
	src/elpa1_tridiag_real_template.X90
parent c2c83c2f
@@ -596,8 +596,8 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
print *,"Warning: using GPU with blocksize different from 128"
! error stop
endif
! set the necessary parameters
@@ -54,6 +54,8 @@
!> \brief Fortran module which provides helper routines for matrix calculations
module ELPA1_AUXILIARY
use elpa_utilities
implicit none
public :: elpa_mult_at_b_real_double !< Multiply double-precision real matrices A**T * B
@@ -101,9 +101,6 @@ module ELPA1_COMPUTE
public :: trans_ev_complex_single ! Transform complex single-precision eigenvectors of a tridiagonal matrix back
#endif
public :: local_index ! Get local index of a block cyclic distributed matrix
public :: least_common_multiple ! Get least common multiple
public :: hh_transform_real_double
public :: hh_transform_real
public :: elpa_reduce_add_vectors_real_double
@@ -228,82 +228,6 @@
end subroutine M_solve_secular_equation_PRECISSION
!-------------------------------------------------------------------------------
#ifndef ALREADY_DEFINED
integer function local_index(idx, my_proc, num_procs, nblk, iflag)
!-------------------------------------------------------------------------------
! local_index: returns the local index for a given global index
! If the global index has no local index on
! processor my_proc, the behaviour is defined by iflag
!
! Parameters
!
! idx Global index
!
! my_proc Processor row/column for which to calculate the local index
!
! num_procs Total number of processors along row/column
!
! nblk Blocksize
!
! iflag Controls the behaviour if idx is not on local processor
! iflag< 0 : Return last local index before that row/col
! iflag==0 : Return 0
! iflag> 0 : Return next local index after that row/col
!-------------------------------------------------------------------------------
use precision
implicit none
integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag
integer(kind=ik) :: iblk
iblk = (idx-1)/nblk ! global block number, 0 based
if (mod(iblk,num_procs) == my_proc) then
! block is local, always return local row/col number
local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1
else
! non local block
if (iflag == 0) then
local_index = 0
else
local_index = (iblk/num_procs)*nblk
if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk
if (iflag>0) local_index = local_index + 1
endif
endif
end function local_index
#endif /* ALREADY_DEFINED */
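For illustration, a self-contained sketch of the indexing logic above (hypothetical names demo_local_index/loc_idx, default integer kinds; not part of this commit). With nblk = 2 and num_procs = 3, global index 7 falls into global block iblk = (7-1)/2 = 3, which is owned by processor mod(3,3) = 0, where its local index is (3/3)*2 + mod(6,2) + 1 = 3:

program demo_local_index
  implicit none
  print *, loc_idx(7, 0, 3, 2, 0)   ! owner processor: prints local index 3
  print *, loc_idx(7, 1, 3, 2, 0)   ! non-local, iflag == 0: prints 0
contains
  ! same logic as local_index above, reduced to default integer kinds
  integer function loc_idx(idx, my_proc, num_procs, nblk, iflag)
    integer, intent(in) :: idx, my_proc, num_procs, nblk, iflag
    integer :: iblk
    iblk = (idx-1)/nblk                              ! global block number, 0 based
    if (mod(iblk,num_procs) == my_proc) then
      loc_idx = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1
    else if (iflag == 0) then
      loc_idx = 0
    else
      loc_idx = (iblk/num_procs)*nblk
      if (mod(iblk,num_procs) > my_proc) loc_idx = loc_idx + nblk
      if (iflag > 0) loc_idx = loc_idx + 1
    endif
  end function loc_idx
end program demo_local_index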
#ifndef ALREADY_DEFINED
integer function least_common_multiple(a, b)
! Returns the least common multiple of a and b
! There may be more efficient ways to do this; we use the simplest approach
use precision
implicit none
integer(kind=ik), intent(in) :: a, b
do least_common_multiple = a, a*(b-1), a
if(mod(least_common_multiple,b)==0) exit
enddo
! if the loop is left regularly, least_common_multiple = a*b
end function least_common_multiple
#endif /* ALREADY_DEFINED */
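A quick standalone trace of this loop (illustrative only, hypothetical name demo_lcm): for a = 4, b = 6 the candidates are 4, 8, 12, ..., 20 and the loop exits at 12; if the loop runs to completion, the DO variable ends at a*b, the correct result when no smaller common multiple exists:

program demo_lcm
  implicit none
  integer :: m
  do m = 4, 4*(6-1), 4        ! candidates 4, 8, ..., 20
    if (mod(m,6) == 0) exit   ! first multiple of 4 divisible by 6 is 12
  enddo
  print *, m                  ! prints 12; a completed loop would leave m = 24 = a*b
end program demo_lcm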
subroutine M_hh_transform_real_PRECISSION(alpha, xnorm_sq, xf, tau)
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
! and returns the factor xf by which x has to be scaled.
@@ -343,4 +267,3 @@
ALPHA = BETA
endif
end subroutine M_hh_transform_real_PRECISSION
#define ALREADY_DEFINED 1
@@ -52,36 +52,35 @@
! distributed along with the original code in the file "COPYING".
#endif
#define check_cuda(success) call check_memcpy_CUDA(__FILE__, __LINE__, success)
!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
! Parameters
!
!> \param na Order of matrix
!>
!> \param a(lda,matrixCols) Distributed matrix which should be reduced.
!> Distribution is like in Scalapack.
!> As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!> a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda Leading dimension of a
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix
!>
!> \param mpi_comm_rows MPI-Communicator for rows
!> \param mpi_comm_cols MPI-Communicator for columns
!>
!> \param d(na) Diagonal elements (returned), identical on all processors
!>
!> \param e(na) Off-Diagonal elements (returned), identical on all processors
!>
!> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
!>
subroutine M_tridiag_real_PRECISSION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU)
!-------------------------------------------------------------------------------
! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form
! (like Scalapack Routine PDSYTRD)
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
! matrixCols local columns of matrix
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! d(na) Diagonal elements (returned), identical on all processors
!
! e(na) Off-Diagonal elements (returned), identical on all processors
!
! tau(na) Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
use cuda_functions
use iso_c_binding
@@ -101,45 +100,55 @@
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,matrixCols)
#endif
integer(kind=ik), parameter :: max_stored_rows = 32
integer(kind=ik), parameter :: max_stored_uv = 32
! ids of this process within the processor row and column, and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, my_rank
integer(kind=ik) :: mpierr
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, max_local_cols
! updated after each istep (in the main cycle) to contain the number of
! local columns and rows of the remaining part of the matrix
integer(kind=ik) :: l_cols, l_rows
!integer(kind=ik) :: l_cols, l_rows
integer(kind=c_size_t) :: l_cols, l_rows
! number of local columns used for allocation of a_dev
! pkus: isn't it the same as matrixCols?
integer(kind=ik) :: na_cols
integer(kind=C_intptr_T) :: a_dev
integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, uv_stored_cols_dev
logical :: successCUDA
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: nstor
integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
integer(kind=ik) :: n_stored_vecs
integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
integer(kind=c_size_t) :: a_offset
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter
integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real(kind=REAL_DATATYPE) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
real(kind=REAL_DATATYPE) :: vav, vnorm2, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
vr(:), &
vc(:), &
ur(:), &
uc(:), &
vur(:,:), &
uvc(:,:)
v_row(:), & ! used to store calculated Householder vector
v_col(:), & ! the same vector, but transposed - differently distributed among MPI tasks
u_row(:), &
u_col(:)
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored; then the matrix A_i is explicitly updated
! u and v are each stored in both row and column form
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
real(kind=REAL_DATATYPE), allocatable :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
real(kind=REAL_DATATYPE), allocatable :: uv_stored_cols(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
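The interleaving matters because a single GEMM over the two arrays then reproduces the accumulated symmetric update sum_i (v_i*u_i**T + u_i*v_i**T). A standalone check of that identity (illustrative, ad-hoc names and default kinds; assumes a linked BLAS providing dgemm):

program demo_stored_uv
  implicit none
  integer, parameter :: n = 4
  double precision :: v(n), u(n), vu(n,2), uv(n,2), a(n,n), b(n,n)
  integer :: i, j
  call random_number(v); call random_number(u)
  vu(:,1) = v;  vu(:,2) = u        ! pattern of vu_stored_rows: v1, u1, ...
  uv(:,1) = u;  uv(:,2) = v        ! pattern of uv_stored_cols: u1, v1, ...
  a = 0;  b = 0
  ! one GEMM applies the whole rank-2 update: a <- a + vu * uv**T
  call dgemm('N', 'T', n, n, 2, 1.0d0, vu, n, uv, n, 1.0d0, a, n)
  do j = 1, n
    do i = 1, n
      b(i,j) = v(i)*u(j) + u(i)*v(j)   ! explicit v*u**T + u*v**T
    enddo
  enddo
  print *, 'max difference:', maxval(abs(a - b))   ! of the order of machine epsilon
end program demo_stored_uv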
@@ -161,121 +170,140 @@
call timer%stop("mpi_communication")
#endif
! pkus: what is the difference between na_cols and matrixCols?
! pkus: probably matrixCols is not supplied when using
if (useGPU) then
#ifdef WITH_MPI
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_cols = na
#endif
write(*,*) "na_cols", na_cols, "matrixCols", matrixCols
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! pkus: what is tile size exactly?
! pkus: can it be larger than na?
! it seems that a tile is a square submatrix consisting of several blocks;
! it is the smallest possible square submatrix for which the blocks distributed among
! processors are "aligned" in both rows and columns
! -----------------
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- ...
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- .
! : : : : : : .
! : : : : : : .
!
! this is a tile, where each number represents a block assigned to the processor with that number
! the size of each small block is nblk
! the image shows the situation with 6 processors: 3 processor rows and 2 columns
! tile_size is thus nblk * 6
!
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
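Illustrative arithmetic for the 6-processor layout sketched above (np_rows = 3, np_cols = 2) with nblk = 128, the blocksize this GPU port currently expects:

tile_size       = 128 * lcm(3,2)                     = 768
tile_size       = ((128*max(3,2) - 1)/768 + 1) * 768 = 768   (local tiles already at least 128 wide)
l_rows_per_tile = 768/3                              = 256
l_cols_per_tile = 768/2                              = 384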
totalblocks = (na-1)/nblk + 1
max_blocks_row = (totalblocks-1)/np_rows + 1
max_blocks_col = (totalblocks-1)/np_cols + 1
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
max_loc_block_rows = (totalblocks-1)/np_rows + 1
max_loc_block_cols = (totalblocks-1)/np_cols + 1
! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
max_local_rows = max_loc_block_rows*nblk
max_local_cols = max_loc_block_cols*nblk
! allocate memory for vectors
! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "tmp", istat, errorMessage)
allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vr "//errorMessage
stop
endif
! allocate v_row 1 element longer so that tau can be stored and broadcast together with it
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "v_row", istat, errorMessage)
allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating ur "//errorMessage
stop
endif
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "u_row", istat, errorMessage)
allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vc "//errorMessage
stop
endif
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "v_col", istat, errorMessage)
allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uc "//errorMessage
stop
endif
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "u_col", istat, errorMessage)
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating ur_p "//errorMessage
stop
endif
call check_alloc("tridiag_real", "ur_p", istat, errorMessage)
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uc_p "//errorMessage
stop
endif
call check_alloc("tridiag_real", "uc_p", istat, errorMessage)
#endif
tmp = 0
vr = 0
ur = 0
vc = 0
uc = 0
v_row = 0
u_row = 0
v_col = 0
u_col = 0
allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating vur "//errorMessage
stop
endif
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "vu_stored_rows", istat, errorMessage)
allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating uvc "//errorMessage
stop
endif
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_real", "uv_stored_cols", istat, errorMessage)
! if (useGPU) then
! allocate(vr_dev(max_local_rows))
! allocate(ur_dev(max_local_rows))
! allocate(vc_dev(max_local_cols))
! allocate(uc_dev(max_local_cols))
! allocate(vur_dev(max_local_rows,2*max_stored_rows))
! allocate(uvc_dev(max_local_cols,2*max_stored_rows))
! endif
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "v_row_dev", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "u_row_dev", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "v_col_dev", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "u_col_dev", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "vu_stored_rows_dev", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "uv_stored_cols_dev", successCUDA)
endif
d(:) = 0
e(:) = 0
tau(:) = 0
nstor = 0
n_stored_vecs = 0
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols)
if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) &
d(na) = a(l_rows,l_cols)
if (useGPU) then
! allocate memory for matrix A on the device and then copy the matrix
successCUDA = cuda_malloc(a_dev, lda * na_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), lda * na_cols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
endif
! if (useGPU) then
! allocate(a_dev(lda,na))
! a_dev = a
! endif
! main cycle of tridiagonalization
! in each step, 1 Householder vector is calculated
do istep=na,3,-1
do istep=na,3,-1
!call print_a(1,1)
!call print_a_dev(1,1)
!write(*,*) "step", istep, "l rows cols ", l_rows, l_cols
! Calculate number of local rows and columns of the still remaining matrix
! on the local processor
@@ -289,21 +317,33 @@
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
! if (useGPU) then
! vr(1:l_rows) = a_dev(1:l_rows,l_cols+1)
! else
vr(1:l_rows) = a(1:l_rows,l_cols+1)
! endif
if(nstor>0 .and. l_rows>0) then
call M_PRECISSION_GEMV('N', l_rows, 2*nstor, M_CONST_1_0, vur, ubound(vur,dim=1), &
uvc(l_cols+1,1), ubound(uvc,dim=1), M_CONST_1_0, vr, 1)
! copy column l_cols + 1 of A to v_row
if (useGPU) then
a_offset = l_cols * lda * M_size_of_PRECISSION_real
! we use v_row on the host at the moment!
! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*M_size_of_PRECISSION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)*M_size_of_PRECISSION_real, &
cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
else
v_row(1:l_rows) = a(1:l_rows,l_cols+1)
endif
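The offset follows from column-major storage (a worked note, not part of the commit): element a(1, l_cols+1) lies l_cols full columns of lda elements past a(1,1), hence a_offset = l_cols * lda * M_size_of_PRECISSION_real bytes.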
!if((my_prow == 1) .and. (my_pcol == 1)) write(*,'(2I2, A, 20G14.3)'), l_rows, l_cols, "v_row:", v_row(:)
!write(*,*) "upper bounds: ", ubound(vu_stored_rows,dim=1), ubound(uv_stored_cols,dim=1), max_local_rows, max_local_cols
if(n_stored_vecs>0 .and. l_rows>0) then
call M_PRECISSION_GEMV('N', l_rows, 2*n_stored_vecs, &
M_CONST_1_0, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
M_CONST_1_0, v_row, 1)
endif
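In effect (illustrative algebra, not code from the commit), this GEMV folds the pending, not-yet-applied rank-2 updates into the extracted column:

v_row(1:l_rows) <- v_row(1:l_rows) + sum_{k=1..n_stored_vecs} ( v_k * u_k(l_cols+1) + u_k * v_k(l_cols+1) )

i.e. it adds column l_cols+1 of the accumulated V*U**T + U*V**T, read off through row l_cols+1 of uv_stored_cols.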
if(my_prow==prow(istep-1, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1))
aux1(2) = vr(l_rows)
aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
aux1(2) = v_row(l_rows)
else
aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows))
aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
aux1(2) = 0.
endif
@@ -323,170 +363,232 @@
! Householder transformation
call M_hh_transform_real_PRECISSION(vrl, vnorm2, xf, tau(istep))
! Scale vr and store Householder vector for back transformation
vr(1:l_rows) = vr(1:l_rows) * xf
! Scale v_row and store Householder vector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf
if(my_prow==prow(istep-1, nblk, np_rows)) then
vr(l_rows) = 1.
v_row(l_rows) = 1.
! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
e(istep-1) = vrl
endif
a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation
endif
! store Householder vector for back transformation
a(1:l_rows,l_cols+1) = v_row(1:l_rows)
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
! add tau after the end of the actual v_row, to be broadcast with it
v_row(l_rows+1) = tau(istep)
endif !(my_pcol==pcol(istep, nblk, np_cols))
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(vr, l_rows+1, M_MPI_REAL_PRECISSION, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! Broadcast the Householder vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, M_MPI_REAL_PRECISSION, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */
tau(istep) = vr(l_rows+1)
! Transpose Householder vector vr -> vc
call M_elpa_transpose_vectors_real_PRECISSION (vr, ubound(vr,dim=1), mpi_comm_rows, &
vc, ubound(vc,dim=1), mpi_comm_cols, &
! recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
call M_elpa_transpose_vectors_real_PRECISSION (v_row, ubound(v_row,dim=1), mpi_comm_rows, &
v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
! thus the result is partly in uc(:) and partly in ur(:)
! thus the result is partly in u_col(:) and partly in u_row(:)
uc(1:l_cols) = 0
ur(1:l_rows) = 0
u_col(1:l_cols) = 0
u_row(1:l_rows) = 0
if (l_rows>0 .and. l_cols>0) then
if(useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * M_size_of_PRECISSION_real)
check_cuda(successCUDA)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * M_size_of_PRECISSION_real)
check_cuda(successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
endif
! if (useGPU) then
!u_col_dev(1:l_cols) = 0.
!u_row_dev(1:l_rows) = 0.
!v_col_dev(1:l_cols) = v_col(1:l_cols)
!v_row_dev(1:l_rows) = v_row(1:l_rows)
! do i=0,(istep-2)/tile_size
! l_col_beg = i*l_cols_per_tile+1
! l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
! if(l_col_end<l_col_beg) cycle
! do j=0,i
! l_row_beg = j*l_rows_per_tile+1
! l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
! if(l_row_end<l_row_beg) cycle
! if(mod(n_iter,n_threads) == my_thread) then
! call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
! if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
! endif
! n_iter = n_iter+1
! enddo
! enddo
!
!--- for now, just use DSYMV!!!
! a_dev -> a ?
!write(*,*) "ubound ", ubound(a,1), "lda", lda, "lcols", l_cols
! call M_PRECISSION_SYMV('U', l_cols, &
! 1.d0, a, ubound(a,1), &
! v_row, 1, &
! 0.d0, u_col, 1)
!u_col(1:l_cols) = u_col_dev(1:l_cols)
!u_row(1:l_rows) = u_row_dev(1:l_rows)
! else !do not use GPU
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
call timer%start("OpenMP parallel")
#endif
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
my_thread = omp_get_thread_num()