Commit 78d0a9aa authored by Pavel Kus

first gemm ported to gpu

parent 6febaf23
@@ -58,7 +58,8 @@
&PRECISION &
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
implicit none
@@ -76,6 +77,8 @@
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
! TODO: play with max_strip. If it was larger, matrices being multiplied
! might be larger as well!
integer(kind=ik), parameter :: max_strip=128
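! max_strip bounds the width ncnt of each strip in the
! "do ns = 0, nqcols1-1, max_strip" loop below, and thereby the
! m-dimension of the GEMMs (gemm_dim_m = MIN(max_strip, MAX(1,nqcols1))).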
real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
@@ -101,7 +104,13 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_real
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads, omp_get_thread_num
@@ -574,34 +583,51 @@
endif
enddo
gemm_dim_k = MAX(1,l_rows)
gemm_dim_l = max_local_cols
gemm_dim_m = MIN(max_strip,MAX(1,nqcols1))
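! The three dimensions describe the update
!   qtmp2(gemm_dim_k, gemm_dim_m) += qtmp1(gemm_dim_k, gemm_dim_l) * ev(gemm_dim_l, gemm_dim_m)
! and are fixed here so that the host and device buffers can be
! allocated once, outside the np/ns loops below.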
allocate(qtmp1(gemm_dim_k, gemm_dim_l), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"merge_systems: error when allocating ev "//errorMessage
print *,"merge_systems: error when allocating qtmp1 "//errorMessage
stop 1
endif
allocate(ev(gemm_dim_l,gemm_dim_m), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"merge_systems: error when allocating qtmp1 "//errorMessage
print *,"merge_systems: error when allocating ev "//errorMessage
stop 1
endif
allocate(qtmp2(gemm_dim_k, gemm_dim_m), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"merge_systems: error when allocating qtmp2 "//errorMessage
stop 1
endif
if (useGPU) then
successCUDA = cuda_malloc(qtmp1_dev, gemm_dim_k * gemm_dim_l * size_of_datatype)
check_alloc_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_malloc(ev_dev, gemm_dim_l * gemm_dim_m * size_of_datatype)
check_alloc_cuda("merge_systems: ev_dev", successCUDA)
successCUDA = cuda_malloc(qtmp2_dev, gemm_dim_k * gemm_dim_m * size_of_datatype)
check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA)
successCUDA = cuda_memset(qtmp1_dev, 0, gemm_dim_k * gemm_dim_l * size_of_datatype)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_memset(qtmp2_dev, 0, gemm_dim_k * gemm_dim_m * size_of_datatype)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
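! The device buffers mirror the host arrays element for element, so
! flat cuda_memcpy calls over the full gemm_dim_* products keep both
! sides in the same column-major layout, and the host leading
! dimensions (ubound(...,dim=1)) stay valid for the cuBLAS calls below.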
! Gather nonzero upper/lower components of old matrix Q
! which are needed for multiplication with new eigenvectors
qtmp1 = 0 ! May contain empty (unset) parts
qtmp2 = 0 ! Not really needed
nnzu = 0
nnzl = 0
do i = 1, na1
@@ -640,6 +666,7 @@
np_rem = my_pcol
do np = 1, npc_n
write(*,*) "outer loop ", 1, npc_n
! Do a ring send of qtmp1
@@ -659,9 +686,11 @@
#endif /* WITH_MPI */
endif
if (useGPU) then
successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
endif
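! qtmp1 is overwritten by every ring pass, so it has to be re-uploaded
! once per np iteration; ev and qtmp2 are refreshed per strip below.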
! Gather the parts in d1 and z which are fitting to qtmp1.
! This also delivers nnzu/nnzl for proc np_rem
@@ -698,6 +727,7 @@
enddo
do ns = 0, nqcols1-1, max_strip ! strip-mining loop
write(*, *) "inner loop ", 0, nqcols1-1, max_strip
ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip
@@ -710,6 +740,7 @@
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with upper half of Q:
call obj%timer%start("strange_loop")
do i = 1, ncnt
j = idx(idxq1(i+ns))
! Calculate the j-th eigenvector of the deflated system
@@ -720,28 +751,56 @@
&(obj,tmp,nnzu,ddiff(j))
ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
enddo
call obj%timer%stop("strange_loop")
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
!TODO: the previous loop could be done on the device as well, so that
!less data has to be copied
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
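! A possible reduced copy (untested sketch): in column-major storage
! the first ncnt columns of ev and qtmp2 are contiguous, so it would
! suffice to transfer
!   successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
!                 gemm_dim_l * ncnt * size_of_datatype, cudaMemcpyHostToDevice)
! and analogously gemm_dim_k * ncnt elements of qtmp2.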
! Multiply old Q with eigenvectors (upper half)
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then
write(*,*) "merging-first", l_rnm, ncnt, nnzu
if (useGPU) then
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
ev_dev, ubound(ev,dim=1), &
1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1))
call obj%timer%stop("cublas")
else
call obj%timer%start("blas")
call obj%timer%start("gemm")
call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
1.0_rk, qtmp1, ubound(qtmp1,dim=1), &
ev, ubound(ev,dim=1), &
1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
call obj%timer%stop("gemm")
call obj%timer%stop("blas")
endif ! useGPU
endif
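! The cuBLAS call reuses the host leading dimensions
! ubound(qtmp1,dim=1), ubound(ev,dim=1) and ubound(qtmp2,dim=1); this
! is valid because qtmp1_dev, ev_dev and qtmp2_dev were allocated with
! exactly the host shapes (gemm_dim_k/l/m).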
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
!TODO: actually this will be done after the second multiplication
successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
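! Once the lower-half GEMM is ported as well, this device-to-host
! transfer can move after it (see the TODO above), so qtmp2_dev is
! copied back only once per strip.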
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q:
call obj%timer%start("strange_loop")
do i = 1, ncnt
j = idx(idxq1(i+ns))
! Calculate the j-th eigenvector of the deflated system
@@ -752,6 +811,7 @@
&(obj,tmp,nnzl,ddiff(j))
ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
enddo
call obj%timer%stop("strange_loop")
! Multiply old Q with eigenvectors (lower half)
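! In this commit the lower-half multiplication stays host-only
! ("first gemm ported to gpu"); a symmetric port (untested sketch,
! using byte offsets into the device buffers, which are plain
! c_intptr_t integers) could look like:
!   if (useGPU .and. l_rows-l_rnm > 0 .and. ncnt > 0 .and. nnzl > 0) &
!     call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
!          1.0_rk, qtmp1_dev + l_rnm*size_of_datatype, ubound(qtmp1,dim=1), &
!          ev_dev, ubound(ev,dim=1), &
!          1.0_rk, qtmp2_dev + l_rnm*size_of_datatype, ubound(qtmp2,dim=1))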
@@ -785,8 +845,17 @@
print *,"merge_systems: error when deallocating ev "//errorMessage
stop 1
endif
endif
if(useGPU) then
successCUDA = cuda_free(qtmp1_dev)
check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_free(qtmp2_dev)
check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA)
successCUDA = cuda_free(ev_dev)
check_dealloc_cuda("merge_systems: ev_dev", successCUDA)
endif
endif !very outer test (na1==1 .or. na1==2)
#ifdef WITH_OPENMP
deallocate(z_p, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......