Commit 927d3954 authored by Pavel Kus's avatar Pavel Kus

fixing allocation of tmat_dev

so that it works if one of the bandred / band_to_full steps is done on the GPU
and the other one on the CPU
parent 7bcb325b
......@@ -263,14 +263,6 @@
stop 1
endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
......
......@@ -502,6 +502,8 @@
num_blocks = (na-1)/nbw + 1
! tmat is needed only in full->band and band->full steps, so allocate here
! (not allocated for banded matrix on input)
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
......@@ -512,6 +514,18 @@
stop 1
endif
! if either the full->band or the band->full step is to be done on the GPU,
! also allocate the corresponding array on the GPU.
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
endif
do_bandred = .true.
do_solve_tridi = .true.
do_trans_to_band = .true.
......@@ -685,6 +699,10 @@
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
endif
endif
! if the last step is not required at all, or will be performed on CPU,
......@@ -704,6 +722,10 @@
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
endif
endif
! Backtransform stage 2
......@@ -733,6 +755,16 @@
call obj%timer%stop("trans_ev_to_full")
endif ! do_trans_to_full
if(do_bandred .or. do_trans_to_full) then
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"elpa2_template: error in cudaFree, tmat_dev"
stop 1
endif
endif
endif
if (obj%eigenvalues_only) then
deallocate(q_dummy, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
......@@ -349,7 +349,7 @@
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
stop 1
endif
......@@ -371,7 +371,7 @@
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
stop 1
endif
......@@ -403,7 +403,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy, tmp2"
stop 1
endif
#else /* WITH_MPI */
......@@ -419,7 +419,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy, tmat"
stop 1
endif
!#endif /* WITH_MPI */
......@@ -436,7 +436,7 @@
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
stop 1
endif
endif ! l_rows > 0
......@@ -777,13 +777,6 @@
stop 1
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaFree"
stop 1
endif
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment