Commit 39d32d3e authored by Pavel Kus

complex version of elpa1_trans_ev ported to GPU, seems to work OK

Conflicts:
	src/elpa1_trans_ev_complex_template.X90
parent b8a36fee
@@ -13,6 +13,7 @@ blas_tokens = ["PRECISION_GEMV",
"PRECISION_TRMM",
"PRECISION_HERK",
"cublas_PRECISION_gemm",
"cublas_PRECISION_trmm",
"cublas_PRECISION_gemv",
]
@@ -87,7 +87,8 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
subroutine trans_ev_complex_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
@@ -115,11 +116,15 @@
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
#ifdef HAVE_DETAILED_TIMINGS
@@ -144,46 +149,25 @@
max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating tmat "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating h1 "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating h2 "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating tmp1 "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating tmp2 "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating hvb "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when allocating hvm "//errorMessage
stop
endif
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
@@ -191,12 +175,37 @@
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
nstor = 0
if (useGPU) then
hvm_ubnd = 0
endif
! In the complex case tau(2) /= 0
if (my_prow == prow(1, nblk, np_rows)) then
q(1,1:l_cols) = q(1,1:l_cols)*(CONE-tau(2))
endif
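! GPU setup: allocate device buffers for tmat, hvm, the tmp work space and q,
! and copy q to the device once before the main loop over blocks.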
if (useGPU) then
! todo: this is used only for copying hvm to the device; it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q
successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
do istep=1,na,nblk
ics = MAX(istep,3)
@@ -238,6 +247,9 @@
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
if (useGPU) then
hvm_ubnd = l_rows
endif
nstor = nstor+1
nb = nb+l_rows
enddo
@@ -273,46 +285,111 @@
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1
call PRECISION_TRMV('L', 'C', 'N', n, tmat, max_stored_rows, h2(nc+1),1)
call PRECISION_TRMV('L', 'C', 'N', n, &
tmat, max_stored_rows, &
h2(nc+1),1)
tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
enddo
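! With useGPU, the stored Householder vectors are packed into the contiguous
! buffer hvm1 and copied to hvm_dev, and tmat is copied to tmat_dev, so that
! the update below can run with cuBLAS.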
if (useGPU) then
! todo: is this reshape really necessary?
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
! Q = Q - V * T * V**H * Q
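! Step 1 (gemm below): tmp = V**H * Q; step 2 (trmm + gemm further down):
! Q = Q - V * (T * tmp).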
if (l_rows>0) then
call PRECISION_GEMM('C', 'N', nstor, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1 ,nstor)
else
tmp1(1:l_cols*nstor) = 0
endif
if(useGPU) then
call cublas_PRECISION_gemm('C', 'N', nstor, l_cols, l_rows, &
CONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, &
CZERO, tmp_dev, nstor)
else !useGPU
call PRECISION_GEMM('C', 'N', nstor, l_cols, l_rows, &
CONE, hvm, ubound(hvm,dim=1), &
q, ldq, &
CZERO, tmp1 ,nstor)
endif !useGPU
else !l_rows>0
if (useGPU) then
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_complex)
check_memcpy_cuda("trans_ev", successCUDA)
else
tmp1(1:l_cols*nstor) = 0
endif
endif !l_rows>0
#ifdef WITH_MPI
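! With useGPU the partial product is copied from tmp_dev back to the host, the
! allreduce over mpi_comm_rows runs on tmp1/tmp2 as in the CPU path, and the
! reduced tmp2 is then copied back to the device.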
! In the legacy GPU version this allreduce was omitted, but it probably has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
if (l_rows>0) then
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, CONE, q, ldq)
endif
! copy the reduced tmp2 back to the device
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
#else
! tmp2 = tmp1
#endif
if (l_rows>0) then
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, CONE, q, ldq)
endif
if(useGPU) then
call cublas_PRECISION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
CONE, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call cublas_PRECISION_gemm('N', 'N', l_rows, l_cols, nstor, &
-CONE, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
CONE, q_dev, ldq)
else !useGPU
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONE, tmat, max_stored_rows, &
tmp2, nstor)
!q = q - hvm*tmp2
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
CONE, q, ldq)
#else
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONE, tmat, max_stored_rows, &
tmp1, nstor)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
CONE, q, ldq)
#endif
endif ! useGPU
endif ! l_rows>0
! if (l_rows>0) then
! call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
@@ -321,9 +398,9 @@
! endif
nstor = 0
endif
enddo
endif
enddo ! istep=1,na,nblk
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
@@ -331,6 +408,32 @@
stop
endif
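! GPU cleanup: copy the transformed eigenvectors back from q_dev and free all
! device buffers.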
if (useGPU) then
!q = q_dev
successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_complex: error when deallocating hvm1 "//errorMessage
stop
endif
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
successCUDA = cuda_free(q_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmp_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(hvm_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmat_dev)
check_dealloc_cuda("trans_ev", successCUDA)
endif
call timer%stop("trans_ev_complex" // PRECISION_SUFFIX)
end subroutine trans_ev_complex_PRECISION
@@ -362,7 +362,7 @@
-M_CONST_1_0, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
M_CONST_1_0, q_dev, ldq)
else
else !useGPU
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call M_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
@@ -617,7 +617,6 @@
endif !useGPU
call hh_transform_complex_PRECISION(vrl, CONST_REAL_0_0, xf, tau(2))
e(1) = vrl
write(*,*) "aux3 ", aux3(1), ", vrl ", vrl, ", e(1) ", e(1)
a(1,l_cols) = 1. ! for consistency only
endif
@@ -11,6 +11,7 @@
#define PRECISION_TRMM ZTRMM
#define PRECISION_HERK ZHERK
#define cublas_PRECISION_gemm cublas_Zgemm
#define cublas_PRECISION_trmm cublas_Ztrmm
#define cublas_PRECISION_gemv cublas_Zgemv
#define PRECISION_SUFFIX "_double"
#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX
@@ -34,6 +35,7 @@
#undef PRECISION_TRMM
#undef PRECISION_HERK
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_trmm
#undef cublas_PRECISION_gemv
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
@@ -56,6 +58,7 @@
#define PRECISION_TRMM CTRMM
#define PRECISION_HERK CHERK
#define cublas_PRECISION_gemm cublas_Cgemm
#define cublas_PRECISION_trmm cublas_Ctrmm
#define cublas_PRECISION_gemv cublas_Cgemv
#define PRECISION_SUFFIX "_single"
#define MPI_COMPLEX_PRECISION MPI_COMPLEX