Commit c52e3557 authored by Andreas Marek
Browse files

Pass q_dev to _band_to_full

parent 98fb8a3e
...@@ -405,9 +405,7 @@ ...@@ -405,9 +405,7 @@
&_& &_&
&PRECISION & &PRECISION &
(na, nev, nblk, nbw, q, & (na, nev, nblk, nbw, q, &
#if REALCASE == 1
q_dev, & q_dev, &
#endif
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, & ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, &
success, THIS_ELPA_KERNEL) success, THIS_ELPA_KERNEL)
...@@ -440,9 +438,7 @@ ...@@ -440,9 +438,7 @@
&PRECISION & &PRECISION &
(na, nev, nblk, nbw, a, & (na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q, & a_dev, lda, tmat, tmat_dev, q, &
#if REALCASE == 1
q_dev, & q_dev, &
#endif
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU & ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
#if REALCASE == 1 #if REALCASE == 1
, useQRActual & , useQRActual &
......
...@@ -55,9 +55,7 @@ ...@@ -55,9 +55,7 @@
&_& &_&
&PRECISION & &PRECISION &
(na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, & (na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, &
#if REALCASE == 1
q_dev, & q_dev, &
#endif
ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU & ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU &
#if REALCASE == 1 #if REALCASE == 1
,useQr) ,useQr)
...@@ -313,17 +311,17 @@ ...@@ -313,17 +311,17 @@
! endif ! endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex) ! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex)
if (.not.(successCUDA)) then ! if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc" ! print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop ! stop
endif ! endif
!
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice) ! successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then ! if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" ! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop ! stop
endif ! endif
#endif #endif
! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided ! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
......
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
&_& &_&
&PRECISION & &PRECISION &
(na, nev, nblk, nbw, q, & (na, nev, nblk, nbw, q, &
#if REALCASE == 1
q_dev, & q_dev, &
#endif
ldq, matrixCols, & ldq, matrixCols, &
#if REALCASE == 1 #if REALCASE == 1
hh_trans_real, & hh_trans_real, &
...@@ -87,8 +85,9 @@ ...@@ -87,8 +85,9 @@
#endif #endif
real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:) real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:)
integer(kind=c_intptr_t) :: q_dev
#endif #endif
integer(kind=c_intptr_t) :: q_dev
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE #ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*) complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
...@@ -2657,21 +2656,38 @@ ...@@ -2657,21 +2656,38 @@
#endif /* WITH_MPI */ #endif /* WITH_MPI */
#if REALCASE == 1 !#if REALCASE == 1
! copy q to q_dev needed in trans_ev_band_to_full ! copy q to q_dev needed in trans_ev_band_to_full
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_real) successCUDA = cuda_malloc(q_dev, ldq*matrixCols* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" print *,"trans_ev_tridi_to_band_&
&MAATH_DATATYPE&
&: error in cudaMalloc"
stop stop
endif endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice) successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex, &
#endif
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop stop
endif endif
#endif !#endif
! deallocate all working space ! deallocate all working space
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment