Commit c581d326 authored by Andreas Marek

Refactor GPU part in bandred

parent 82e195a5
@@ -1447,7 +1447,6 @@
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
#if REALCASE == 1
if (useGPU) then
! here the GPU version and CPU version diverged due to the same reasons as above
@@ -1456,23 +1455,54 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
#if REALCASE == 1
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows, &
#endif
#if COMPLEXCASE == 1
(vmrCUDA(1,n_cols+1),ubound(vmrCUDA,dim=1), &
#endif
mpi_comm_rows, umcCUDA, &
#if REALCASE == 1
cur_l_cols, &
#endif
#if COMPLEXCASE == 1
ubound(umcCUDA,dim=1), &
#endif
mpi_comm_cols, istep*nbw, n_cols, nblk)
endif
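! umcCUDA holds only this process row's contribution to U so far;
! the allreduce over mpi_comm_rows below sums the contributions of
! all process rows into the complete result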
#ifdef WITH_MPI
if (l_cols>0) then
#if REALCASE == 1
allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
#endif
#if COMPLEXCASE == 1
allocate(tmpCUDA(l_cols,n_cols), stat=istat, errmsg=errorMessage)
#endif
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tmpCUDA "//errorMessage
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating tmpCUDA "//errorMessage
stop
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, ierr)
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, ierr)
#if REALCASE == 1
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
#endif
#if COMPLEXCASE == 1
umcCUDA(1:l_cols,1:n_cols) = tmpCUDA(1:l_cols,1:n_cols)
#endif
call timer%stop("mpi_communication")
#else /* WITH_MPI */
@@ -1483,57 +1513,121 @@
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage
print *,"bandred_&
&MATH_DATATYPE&
&: error when deallocating tmpCUDA "//errorMessage
stop
endif
endif
endif ! l_cols
! U = U * Tmat**T
#if REALCASE == 1
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
#if REALCASE == 1
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
call timer%start("cublas")
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols)
#if REALCASE == 1
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', &
#endif
l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
call timer%start("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
#if REALCASE == 1
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
call timer%start("cublas")
call cublas_PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, &
CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, &
CONST_0_0, vav_dev, nbw)
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_GEMM('C', 'N', &
#endif
n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, &
#if REALCASE == 1
(umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, &
#endif
#if COMPLEXCASE == 1
(umc_dev +( cur_l_cols *n_cols)*size_of_PRECISION_complex ), cur_l_cols, &
#endif
ZERO, vav_dev, nbw)
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, vav_dev, nbw)
#if REALCASE == 1
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', &
#endif
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call timer%stop("cublas")
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
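! vav is still only the local contribution; reduce it over all
! process columns. The result is symmetric in the real case and
! Hermitian in the complex case, hence the two allreduce variants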
#if REALCASE == 1
call symm_matrix_allreduce_&
#endif
#if COMPLEXCASE == 1
call herm_matrix_allreduce_&
#endif
&PRECISION &
(n_cols,vav, nbw,nbw,mpi_comm_cols)
#if REALCASE == 1
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
@@ -1541,14 +1635,34 @@
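! U = U - 0.5 * V * VAV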
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
-CONST_0_5, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, vav_dev,nbw,&
CONST_1_0, umc_dev, cur_l_cols)
#if REALCASE == 1
-CONST_0_5, &
#endif
#if COMPLEXCASE == 1
CONST_COMPLEX_PAIR_NEGATIVE_0_5, &
#endif
(umc_dev+(cur_l_cols * n_cols )* &
#if REALCASE == 1
size_of_PRECISION_real), &
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex), &
#endif
cur_l_cols, vav_dev,nbw, &
ONE, umc_dev, cur_l_cols)
call timer%stop("cublas")
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(umcCUDA(1,1)),umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
@@ -1557,19 +1671,40 @@
&MATH_DATATYPE&
&_&
&PRECISION &
#if REALCASE == 1
(umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
#endif
#if COMPLEXCASE == 1
(umcCUDA, ubound(umcCUDA,dim=1), mpi_comm_cols, &
vmrCUDA(1,n_cols+1), ubound(vmrCUDA,dim=1), mpi_comm_rows, &
#endif
1, istep*nbw, n_cols, nblk)
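! Copy both vector blocks back to the device for the tile-wise
! update of A below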
#if REALCASE == 1
successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(vmr_dev,loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
#if REALCASE == 1
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(umc_dev,loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
@@ -1581,15 +1716,34 @@
if (lce<lcs .or. lre<1) cycle
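! A = A - V*U**T - U*V**T for this tile: a single GEMM with inner
! dimension 2*n_cols applies both products at once, since V and U
! are stored next to each other in vmr and umc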
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'T', lre, lce-lcs+1, 2*n_cols, -CONST_1_0, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*size_of_PRECISION_real), cur_l_cols, &
CONST_1_0, (a_dev+(lcs-1)*lda*size_of_PRECISION_real), lda)
#if REALCASE == 1
call cublas_PRECISION_GEMM('N', 'T', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_GEMM('N', 'C', &
#endif
lre, lce-lcs+1, 2*n_cols, -ONE, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)* &
#if REALCASE == 1
size_of_PRECISION_real), &
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex), &
#endif
cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
#if REALCASE == 1
size_of_PRECISION_real), lda)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex),lda)
#endif
call timer%stop("cublas")
enddo
else ! do not useGPU
#if REALCASE == 1
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_&
@@ -1708,62 +1862,13 @@
call timer%stop("blas")
enddo
#endif /* WITH_OPENMP */
#endif /* REALCASE == 1 */
endif ! useGPU
#endif /* REALCASE == 1 */
#if COMPLEXCASE == 1
if (useGPU) then
! here the GPU version and CPU version diverged due to the same reasons as above
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCUDA(1,n_cols+1),ubound(vmrCUDA,dim=1),mpi_comm_rows, &
umcCUDA, ubound(umcCUDA,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
#ifdef WITH_MPI
if (l_cols>0) then
allocate(tmpCUDA(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmpCUDA "//errorMessage
stop
endif
call timer%start("mpi_communication")
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
umcCUDA(1:l_cols,1:n_cols) = tmpCUDA(1:l_cols,1:n_cols)
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmpCUDA "//errorMessage
stop
endif
endif
#else /* WITH_MPI */
! if (l_cols>0) then
! allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when allocating tmp "//errorMessage
! stop
! endif
! tmp(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
!
! umcCPU(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
! deallocate(tmp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when deallocating tmp "//errorMessage
! stop
! endif
! endif
#endif /* WITH_MPI */
else ! useGPU
@@ -1820,23 +1925,6 @@
! U = U * Tmat**T
if (useGPU) then
!if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
! print *,"bandred_complex: umcCPU size 4 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
! stop
!endif
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
stop
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed tmat_dev ", istat
stop
endif
call timer%start("cublas")
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
call timer%stop("cublas")
else ! not useGPU
call timer%start("blas")
call PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), &
@@ -1846,32 +1934,7 @@
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
call timer%start("cublas")
call cublas_PRECISION_GEMM('C', 'N', n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
*size_of_PRECISION_complex ), cur_l_cols, ZERO, vav_dev, nbw)
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call timer%stop("cublas")
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav ", istat
stop
endif
call herm_matrix_allreduce_&
&PRECISION &
(n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
else ! useGPU
call timer%start("blas")
call PRECISION_GEMM('C', 'N', n_cols, n_cols, l_cols, ONE, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
@@ -1887,48 +1950,6 @@
! U = U - 0.5 * V * VAV
if (useGPU) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, CONST_COMPLEX_PAIR_NEGATIVE_0_5, (umc_dev + &
(cur_l_cols * n_cols )*size_of_PRECISION_complex), &
cur_l_cols, vav_dev, nbw, ONE, umc_dev, cur_l_cols)
call timer%stop("cublas")
! Transpose umc -> umr (stored in vmr, second half)
! if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
! print *,"bandred_complex: umcCPU size 5 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
! stop
! endif
successCUDA = cuda_memcpy(loc(umcCUDA(1,1)),umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umcCPU ", istat
stop
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCUDA, ubound(umcCUDA,dim=1), mpi_comm_cols, &
vmrCUDA(1,n_cols+1), ubound(vmrCUDA,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
! print *,"bandred_complex: vmr size 4 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
! stop
! endif
successCUDA = cuda_memcpy(vmr_dev,loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed vav_dev", istat
stop
endif
! if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
! print *,"bandred_complex: umcCPU size 6 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
! stop
! endif
successCUDA = cuda_memcpy(umc_dev,loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
stop
endif
else ! not useGPU
call timer%start("blas")
call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, CONST_COMPLEX_PAIR_NEGATIVE_0_5, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), &
@@ -1951,11 +1972,6 @@
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
if (useGPU) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'C', lre, lce-lcs+1, 2*n_cols, -ONE, &
vmr_dev ,cur_l_rows, (umc_dev +(lcs-1)*size_of_PRECISION_complex),cur_l_cols, &
ONE, (a_dev + (lcs-1)*lda*size_of_PRECISION_complex),lda)
call timer%stop("cublas")
else
call timer%start("blas")
call PRECISION_GEMM('N', 'C', lre,lce-lcs+1, 2*n_cols, -ONE, &
......