Commit 74b2a795 authored by Pavel Kus's avatar Pavel Kus

gpu tridiagonalization can be made using larger matrix-vector

multiplies. The older (stripe) version also remains, controlled
by a parameter
parent c8b27c48
......@@ -123,6 +123,7 @@
real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, my_rank
......@@ -375,7 +376,7 @@
! copy l_cols + 1 column of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
......@@ -459,12 +460,12 @@
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
(v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
......@@ -489,41 +490,6 @@
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
#if REALCASE == 1
! if (useGPU) then
!u_col_dev(1:l_cols) = 0.
!u_row_dev(1:l_rows) = 0.
!v_col_dev(1:l_cols) = v_col(1:l_cols)
!v_row_dev(1:l_rows) = v_row(1:l_rows)
! do i=0,(istep-2)/tile_size
! l_col_beg = i*l_cols_per_tile+1
! l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
! if(l_col_end<l_col_beg) cycle
! do j=0,i
! l_row_beg = j*l_rows_per_tile+1
! l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
! if(l_row_end<l_row_beg) cycle
! if(mod(n_iter,n_threads) == my_thread) then
! call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
! if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
! endif
! n_iter = n_iter+1
! enddo
! enddo
!
!--- for now, just use DSYMV!!!
! a_dev -> a_mat ?
!write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
! call PRECISION_SYMV('U', l_cols, &
! 1.d0, a_mat, ubound(a_mat,1), &
! v_row, 1, &
! 0.d0, u_col, 1)
!u_col(1:l_cols) = u_col_dev(1:l_cols)
!u_row(1:l_rows) = u_row_dev(1:l_rows)
! else !do not use GPU
#endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
......@@ -533,6 +499,7 @@
n_iter = 0
! first calculate A*v part of (A + VU**T + UV**T)*v
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
......@@ -546,7 +513,7 @@
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call timer%start("blas")
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
......@@ -558,32 +525,15 @@
ONE, ur_p(l_row_beg,my_thread), 1)
endif
call timer%stop("blas")
call timer%stop("blas")
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
if (useGPU) then
! a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
! call timer%start("cublas")
! call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
! l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1)
!
! if(i/=j) then
! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
! call timer%stop("cublas")
else ! useGPU
! multiplication by blocks is efficient only for CPU
! for GPU we introduced 2 other ways, either by stripes (more similar to the original
! CPU implementation) or by one large matrix vector multiply
if (.not. useGPU) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
......@@ -598,50 +548,74 @@
v_col(l_col_beg), 1, &
ONE, u_row(l_row_beg), 1)
endif
call timer%stop("blas")
endif ! useGPU
call timer%stop("blas")
endif ! not useGPU
#endif /* WITH_OPENMP */
enddo ! j=0,i
enddo ! i=0,(istep-2)/tile_size
if (useGPU) then
! perform multiplication by stripes - it is faster than by blocks
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
size_of_datatype
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
enddo
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
if(mat_vec_as_one_block) then
! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
! this requires altering of the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
call timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
ONE, a_dev, lda, &
v_row_dev , 1, &
ONE, u_col_dev, 1)
! todo: try with non transposed!!!
! if(i/=j) then
! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, lda, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
call timer%stop("cublas")
else
!perform multiplication by stripes - it is faster than by blocks, since we call cublas with
!larger matrices. In general, however, this algorithm is very similar to the one with CPU
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
size_of_datatype
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
enddo
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
l_row_beg = 1
l_row_end = min(l_rows,i*l_rows_per_tile)
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
size_of_datatype
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
enddo
end if !multiplication as one block / per stripes
l_row_beg = 1
l_row_end = min(l_rows,i*l_rows_per_tile)
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
size_of_datatype
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
enddo
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
......@@ -666,8 +640,9 @@
enddo
#endif /* WITH_OPENMP */
! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
if (n_stored_vecs > 0) then
call timer%start("blas")
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', &
#endif
......@@ -681,7 +656,7 @@
call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
aux, 1, ONE, u_col, 1)
call timer%stop("blas")
call timer%stop("blas")
endif
endif ! (l_rows>0 .and. l_cols>0)
......@@ -798,6 +773,7 @@
endif
do i = 0, (istep-2)/tile_size
! go over tiles above (or on) the diagonal
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
l_row_beg = 1
......@@ -805,17 +781,22 @@
if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
cycle
if (useGPU) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
size_of_datatype, &
max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) * &
size_of_datatype, &
max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
size_of_datatype , lda)
call timer%stop("cublas")
if(.not. mat_vec_as_one_block) then
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as for the CPU version
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
size_of_datatype, &
max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) * &
size_of_datatype, &
max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
size_of_datatype , lda)
call timer%stop("cublas")
endif
else !useGPU
call timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
......@@ -826,9 +807,21 @@
call timer%stop("blas")
endif !useGPU
enddo
if (useGPU) then
if(mat_vec_as_one_block) then
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev, max_local_rows, &
uv_stored_cols_dev, max_local_cols, &
ONE, a_dev, lda)
call timer%stop("cublas")
endif
endif
n_stored_vecs = 0
endif
if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment