Commit 927d3954 authored by Pavel Kus's avatar Pavel Kus

fixing allocation of tmat_dev

so that it works if one of the bandred / band_to_full steps is done on the GPU
and the other one on the CPU
parent 7bcb325b
...@@ -263,14 +263,6 @@ ...@@ -263,14 +263,6 @@
stop 1 stop 1
endif endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype) successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"bandred_& print *,"bandred_&
......
...@@ -502,6 +502,8 @@ ...@@ -502,6 +502,8 @@
num_blocks = (na-1)/nbw + 1 num_blocks = (na-1)/nbw + 1
! tmat is needed only in the full->band and band->full steps, so allocate it here
! (it is not allocated when a banded matrix is given on input)
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage) allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"solve_evp_& print *,"solve_evp_&
...@@ -512,6 +514,18 @@ ...@@ -512,6 +514,18 @@
stop 1 stop 1
endif endif
! if either of full->band or band->full steps are to be done on GPU,
! allocate also corresponding array on GPU.
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
endif
do_bandred = .true. do_bandred = .true.
do_solve_tridi = .true. do_solve_tridi = .true.
do_trans_to_band = .true. do_trans_to_band = .true.
...@@ -685,6 +699,10 @@ ...@@ -685,6 +699,10 @@
! to transfer q to the host ! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost) successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
endif
endif endif
! if the last step is not required at all, or will be performed on CPU, ! if the last step is not required at all, or will be performed on CPU,
...@@ -704,6 +722,10 @@ ...@@ -704,6 +722,10 @@
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype) successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice) successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
endif
endif endif
! Backtransform stage 2 ! Backtransform stage 2
...@@ -733,6 +755,16 @@ ...@@ -733,6 +755,16 @@
call obj%timer%stop("trans_ev_to_full") call obj%timer%stop("trans_ev_to_full")
endif ! do_trans_to_full endif ! do_trans_to_full
if(do_bandred .or. do_trans_to_full) then
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"elpa2_template: error in cudaFree, tmat_dev"
stop 1
endif
endif
endif
if (obj%eigenvalues_only) then if (obj%eigenvalues_only) then
deallocate(q_dummy, stat=istat, errmsg=errorMessage) deallocate(q_dummy, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
......
...@@ -349,7 +349,7 @@ ...@@ -349,7 +349,7 @@
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice) successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy" print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
stop 1 stop 1
endif endif
...@@ -371,7 +371,7 @@ ...@@ -371,7 +371,7 @@
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce) ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost) successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy" print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
stop 1 stop 1
endif endif
...@@ -403,7 +403,7 @@ ...@@ -403,7 +403,7 @@
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_& print *,"trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
&: error in cudaMemcpy" &: error in cudaMemcpy, tmp2"
stop 1 stop 1
endif endif
#else /* WITH_MPI */ #else /* WITH_MPI */
...@@ -419,7 +419,7 @@ ...@@ -419,7 +419,7 @@
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_& print *,"trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
&: error in cudaMemcpy" &: error in cudaMemcpy, tmat"
stop 1 stop 1
endif endif
!#endif /* WITH_MPI */ !#endif /* WITH_MPI */
...@@ -436,7 +436,7 @@ ...@@ -436,7 +436,7 @@
! this is not necessary hvm is not used anymore ! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost) successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy" print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
stop 1 stop 1
endif endif
endif ! l_rows > 0 endif ! l_rows > 0
...@@ -777,13 +777,6 @@ ...@@ -777,13 +777,6 @@
stop 1 stop 1
endif endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaFree"
stop 1
endif
! final transfer of q_dev ! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost) successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment