Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
c91e8d18
Commit
c91e8d18
authored
Jun 14, 2016
by
Andreas Marek
Browse files
Fix error with GPU memcopies: too little was transferred
parent
746911c9
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
src/elpa2_compute_real_template.X90
View file @
c91e8d18
...
@@ -1442,12 +1442,15 @@
...
@@ -1442,12 +1442,15 @@
stop
stop
endif
endif
! this is not necessart tmat_dev is passed (unchanged) from one routine to the other
!#ifdef WITH_MPI
! successCUDA = cuda_free(tmat_dev)
! it should be possible to keep tmat dev on the device and not copy it around
! if (.not.(successCUDA)) then
! this is not necessary tmat_dev is passed (unchanged) from one routine to the other
! print *,"bandred_real: error in cudaFree"
successCUDA = cuda_free(tmat_dev)
! stop
if (.not.(successCUDA)) then
! endif
print *,"bandred_real: error in cudaFree"
stop
endif
!#endif
successCUDA = cuda_free(vav_dev)
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
if (.not.(successCUDA)) then
...
@@ -1756,16 +1759,20 @@
...
@@ -1756,16 +1759,20 @@
stop
stop
endif
endif
!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it around
! already existent on GPU
! already existent on GPU
!#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
!#else
#else
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
!#endif
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_dev already living on device
! q_dev already living on device
!#ifdef DOUBLE_PRECISION_REAL
!#ifdef DOUBLE_PRECISION_REAL
...
@@ -1782,7 +1789,7 @@
...
@@ -1782,7 +1789,7 @@
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
!#ifdef DOUBLE_PRECISION_REAL
!#ifdef DOUBLE_PRECISION_REAL
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
!!
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
!#else
!#else
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
...
@@ -1969,18 +1976,21 @@
...
@@ -1969,18 +1976,21 @@
#endif /* WITH_MPI */
#endif /* WITH_MPI */
! already existent on GPU
!#ifdef WITH_MPI
!#ifdef DOUBLE_PRECISION_REAL
! it should be possible to keep tmat on the device and not copy it around
#ifdef DOUBLE_PRECISION_REAL
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
!#else
#else
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
!#endif
#endif
! if (.not.(successCUDA)) then
if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
stop
! endif
endif
!#endif /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
...
@@ -1991,17 +2001,17 @@
...
@@ -1991,17 +2001,17 @@
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
#endif
#endif
!
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
!
! copy to host maybe this can be avoided
! copy to host maybe this can be avoided
!
! this is not necessary hvm is not used anymore
! this is not necessary hvm is not used anymore
!
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
!
#else
#else
!
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
!
#endif
#endif
!
if (.not.(successCUDA)) then
if (.not.(successCUDA)) then
!
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
!
stop
stop
!
endif
endif
endif ! l_rows > 0
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
!#ifdef WITH_GPU_VERSION
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment