Commit 74b2a795 authored by Pavel Kus

GPU tridiagonalization can be done using larger matrix-vector
multiplies. The older (stripe) version also remains, controlled
by a parameter.
parent c8b27c48
@@ -123,6 +123,7 @@
real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, my_rank
@@ -489,41 +490,6 @@
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
#if REALCASE == 1
! if (useGPU) then
!u_col_dev(1:l_cols) = 0.
!u_row_dev(1:l_rows) = 0.
!v_col_dev(1:l_cols) = v_col(1:l_cols)
!v_row_dev(1:l_rows) = v_row(1:l_rows)
! do i=0,(istep-2)/tile_size
! l_col_beg = i*l_cols_per_tile+1
! l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
! if(l_col_end<l_col_beg) cycle
! do j=0,i
! l_row_beg = j*l_rows_per_tile+1
! l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
! if(l_row_end<l_row_beg) cycle
! if(mod(n_iter,n_threads) == my_thread) then
! call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
! if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
! endif
! n_iter = n_iter+1
! enddo
! enddo
!
!--- for now, just use DSYMV!!!
! a_dev -> a_mat ?
!write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
! call PRECISION_SYMV('U', l_cols, &
! 1.d0, a_mat, ubound(a_mat,1), &
! v_row, 1, &
! 0.d0, u_col, 1)
!u_col(1:l_cols) = u_col_dev(1:l_cols)
!u_row(1:l_rows) = u_row_dev(1:l_rows)
! else !do not use GPU
#endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
@@ -533,6 +499,7 @@
n_iter = 0
! first calculate A*v part of (A + VU**T + UV**T)*v
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
@@ -563,27 +530,10 @@
n_iter = n_iter+1
#else /* WITH_OPENMP */
if (useGPU) then
! a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
! call timer%start("cublas")
! call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
! l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1)
!
! if(i/=j) then
! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
! call timer%stop("cublas")
else ! useGPU
! multiplication by blocks is efficient only for CPU
! for GPU we introduced 2 other ways, either by stripes (more similar to the original
! CPU implementation) or by one large matrix-vector multiply
if (.not. useGPU) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
@@ -599,14 +549,37 @@
ONE, u_row(l_row_beg), 1)
endif
call timer%stop("blas")
endif ! useGPU
endif ! not useGPU
#endif /* WITH_OPENMP */
enddo ! j=0,i
enddo ! i=0,(istep-2)/tile_size
if (useGPU) then
! perform multiplication by stripes - it is faster than by blocks
if(mat_vec_as_one_block) then
! Unlike the CPU version, we (for each MPI process) do just one large mat-vec multiplication
! this requires altering the algorithm when the matrix is later explicitly updated
! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
call timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
ONE, a_dev, lda, &
v_row_dev , 1, &
ONE, u_col_dev, 1)
! todo: try with non-transposed!!!
! if(i/=j) then
! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
call timer%stop("cublas")
else
! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
! larger matrices. In general, however, this algorithm is very similar to the CPU one
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
@@ -641,6 +614,7 @@
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
enddo
end if ! multiplication as one block / by stripes
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
@@ -666,6 +640,7 @@
enddo
#endif /* WITH_OPENMP */
! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
if (n_stored_vecs > 0) then
call timer%start("blas")
#if REALCASE == 1
@@ -798,6 +773,7 @@
endif
do i = 0, (istep-2)/tile_size
! go over tiles above (or on) the diagonal
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
l_row_beg = 1
@@ -805,7 +781,11 @@
if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
cycle
if (useGPU) then
if(.not. mat_vec_as_one_block) then
! if using mat-vec multiplication by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as the CPU version
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
@@ -816,6 +796,7 @@
max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
size_of_datatype , lda)
call timer%stop("cublas")
endif
else !useGPU
call timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
@@ -827,8 +808,20 @@
endif !useGPU
enddo
n_stored_vecs = 0
if (useGPU) then
if(mat_vec_as_one_block) then
! update the whole (remaining) part of the matrix, including tiles below the diagonal
! we can do that in one large cublas call
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev, max_local_rows, &
uv_stored_cols_dev, max_local_cols, &
ONE, a_dev, lda)
call timer%stop("cublas")
endif
endif
n_stored_vecs = 0
endif
if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
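
Aside (not part of the commit): the explicit update the comments above refer to is the rank-2k accumulation

    A <- A + V*U**T + U*V**T

applied once max_stored_uv Householder vector pairs have been stored. Because the stripe mat-vec reads only tiles on or above the diagonal, the lower triangle can stay stale and only those tiles need the GEMM update; the one-block GEMV reads the whole local matrix, so all tiles must be kept up to date, hence the single large cublas_PRECISION_GEMM when mat_vec_as_one_block is set.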