Commit 03ef1ee4 authored by Andreas Marek's avatar Andreas Marek
Browse files

Remove some unecessary cuda_memcpy in real case

The same should be done for the complex case
parent 29f31c21
......@@ -183,6 +183,7 @@ contains
integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: nbw, num_blocks
real(kind=rk8), allocatable :: tmat(:,:,:), e(:)
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=ik) :: i
logical :: success
......@@ -322,11 +323,11 @@ contains
ttt0 = MPI_Wtime()
ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#else
call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
......@@ -395,11 +396,11 @@ contains
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#else
call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#endif
......@@ -421,10 +422,10 @@ contains
print *,"useGPU== ",useGPU
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
#else
call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
#endif
......@@ -534,6 +535,7 @@ contains
integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: nbw, num_blocks
real(kind=rk4), allocatable :: tmat(:,:,:), e(:)
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=ik) :: i
logical :: success
......@@ -672,11 +674,11 @@ contains
ttt0 = MPI_Wtime()
ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#else
call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
......@@ -745,11 +747,11 @@ contains
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#else
call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#endif
......@@ -771,10 +773,10 @@ contains
print *,"useGPU== ",useGPU
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
#else
call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
#endif
......
......@@ -62,11 +62,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
subroutine bandred_real_double(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQR)
subroutine bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQR)
#else
subroutine bandred_real_single(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQR)
subroutine bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQR)
#endif
!-------------------------------------------------------------------------------
......@@ -135,6 +135,7 @@
real(kind=REAL_DATATYPE) :: dwork_size(1)
real(kind=REAL_DATATYPE), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
! a_dev is passed from bandred_real to trans_ev_band
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
......@@ -1240,7 +1241,9 @@
1.0_rk4, (a_dev+(lcs-1)*lda*size_of_single_real_datatype), lda)
#endif
enddo
else ! do not useGPU
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
#ifdef DOUBLE_PRECISION_REAL
......@@ -1422,6 +1425,7 @@
enddo ! istep
if (useGPU) then
! this is not needed since a_dev is passed along from one subroutine to the other
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy ( loc (a), a_dev, lda*na_cols*size_of_double_real_datatype,cudaMemcpyDeviceToHost)
#else
......@@ -1438,11 +1442,12 @@
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
! this is not necessart tmat_dev is passed (unchanged) from one routine to the other
! successCUDA = cuda_free(tmat_dev)
! if (.not.(successCUDA)) then
! print *,"bandred_real: error in cudaFree"
! stop
! endif
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
......@@ -1609,11 +1614,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
subroutine trans_ev_band_to_full_real_double(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQR)
subroutine trans_ev_band_to_full_real_double(na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, &
numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU, useQR)
#else
subroutine trans_ev_band_to_full_real_single(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQR)
subroutine trans_ev_band_to_full_real_single(na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, &
numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU, useQR)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real:
......@@ -1663,6 +1668,9 @@
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
......@@ -1671,6 +1679,10 @@
real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
! hvm_dev is fist used and set in this routine
! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is passed along from bandred_real
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev
integer(kind=ik) :: i
......@@ -1743,37 +1755,45 @@
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
! already existent on GPU
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_dev already living on device
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_real_datatype)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
!#ifdef DOUBLE_PRECISION_REAL
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_double_real_datatype)
#else
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_single_real_datatype)
......@@ -1868,7 +1888,11 @@
q_dev, ldq , 0.0_rk4, tmp_dev, n_cols)
#endif
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
#ifdef DOUBLE_PRECISION_REAL
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_double_real_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_single_real_datatype, cudaMemcpyDeviceToHost)
......@@ -1877,19 +1901,31 @@
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
else
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_real_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error in cudaMemset"
! stop
! endif
!
!#else
else ! l_rows>0
#ifdef WITH_MPI
tmp1(1:l_cols*n_cols) = 0
!#endif
#else
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
#ifdef WITH_GPU_VERSION
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_double_real_datatype)
#else
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemset"
stop
endif
#endif
#endif /* WITH_MPI */
endif ! l_rows>0
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost)
......@@ -1918,7 +1954,10 @@
!#endif
if (l_rows>0) then
#ifdef WITH_MPI
! after the allreduce we have to copy back to the device
#ifdef DOUBLE_PRECISION_REAL
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_single_real_datatype,cudaMemcpyHostToDevice)
......@@ -1927,15 +1966,21 @@
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
! already existend on GPU
!#ifdef DOUBLE_PRECISION_REAL
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
......@@ -1946,15 +1991,17 @@
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
#endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
!#ifdef DOUBLE_PRECISION_REAL
! ! copy to host maybe this can be avoided
! ! this is not necessary hvm is not used anymore
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
!#else
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
......@@ -2108,13 +2155,15 @@
q, ldq, 0.0_rk4, tmp1, n_cols)
#endif
else
else ! l_rows>0
#ifdef DOUBLE_PRECISION_REAL
tmp1(1:l_cols*n_cols) = 0._rk8
#else
tmp1(1:l_cols*n_cols) = 0._rk4
#endif
endif
endif ! l_rows>0
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
......@@ -2167,6 +2216,7 @@
stop
endif
#ifdef DOUBLE_PRECISION_REAL
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_double_real_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_single_real_datatype, cudaMemcpyDeviceToHost)
......@@ -3336,11 +3386,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
subroutine trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
subroutine trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#else
subroutine trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
subroutine trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
#endif
......@@ -3389,6 +3439,9 @@
#else
real(kind=REAL_DATATYPE) :: q(ldq,matrixCols)
#endif
integer(kind=c_intptr_t) :: q_dev
real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
......@@ -3411,12 +3464,12 @@
logical :: flag
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), pointer :: a(:,:,:,:)
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
real(kind=REAL_DATATYPE), pointer :: a(:,:,:)
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:)
#endif
real(kind=REAL_DATATYPE) :: a_real
type(c_ptr) :: a_ptr
type(c_ptr) :: aIntern_ptr
real(kind=REAL_DATATYPE) , allocatable :: row(:)
real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:)
......@@ -3438,7 +3491,7 @@
! real*8, allocatable, device :: hh_dot_dev(:)
! real*8, allocatable, device :: hh_tau_dev(:)
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1
......@@ -3588,17 +3641,17 @@
if (useGPU) then
#ifdef DOUBLE_PRECISION_REAL
num = (stripe_width*a_dim2*stripe_count)*size_of_double_real_datatype
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_double_real_datatype)
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_double_real_datatype)
#else
num = (stripe_width*a_dim2*stripe_count)*size_of_single_real_datatype
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_single_real_datatype)
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(a_dev , 0, num)
successCUDA = cuda_memset(aIntern_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
......@@ -3606,32 +3659,32 @@
else ! GPUs are not used
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a,[stripe_width,a_dim2,stripe_count] )
!allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
!allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
#ifdef DOUBLE_PRECISION_REAL
a(:,:,:) = 0._rk8
aIntern(:,:,:) = 0._rk8
#else
a(:,:,:) = 0._rk4
aIntern(:,:,:) = 0._rk4
#endif
#endif
......@@ -3715,9 +3768,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
a(:,:,:,my_thread) = 0.0_rk8 ! if possible, do first touch allocation!
aIntern(:,:,:,my_thread) = 0.0_rk8 ! if possible, do first touch allocation!
#else
a(:,:,:,my_thread) = 0.0_rk4 ! if possible, do first touch allocation!
aIntern(:,:,:,my_thread) = 0.0_rk4 ! if possible, do first touch allocation!
#endif
enddo
!$omp end parallel do
......@@ -3766,10 +3819,10 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call unpack_row_real_cpu_openmp_double(a, row, i-limits(ip), my_thread, stripe_count, &
call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(ip), my_thread, stripe_count, &
thread_width, stripe_width, l_nev)
#else
call unpack_row_real_cpu_openmp_single(a, row, i-limits(ip), my_thread, stripe_count, &
call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(ip), my_thread, stripe_count, &
thread_width, stripe_width, l_nev)
#endif
enddo
......@@ -3788,11 +3841,11 @@
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, &
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#else
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, a_dev, stripe_count, &
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#endif
......@@ -3823,9 +3876,9 @@
#endif /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
call unpack_row_real_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
call unpack_row_real_cpu_double(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
call unpack_row_real_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
call unpack_row_real_cpu_single(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
endif
......@@ -3852,10 +3905,10 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call unpack_row_real_cpu_openmp_double(a, row, i-limits(ip), my_thread, &
call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(ip), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#else
call unpack_row_real_cpu_openmp_single(a, row, i-limits(ip), my_thread, &
call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(ip), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#endif
enddo
......@@ -3873,11 +3926,11 @@
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, &
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#else
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, a_dev, stripe_count, &
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
......@@ -3885,9 +3938,9 @@
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
#ifdef DOUBLE_PRECISION_REAL
call unpack_row_real_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
call unpack_row_real_cpu_double(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
call unpack_row_real_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
call unpack_row_real_cpu_single(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
endif
#endif /* WITH_OPENMP */
......@@ -3966,10 +4019,10 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call unpack_row_real_cpu_openmp_double(a, row, i-limits(my_prow), my_thread, &
call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(my_prow), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#else
call unpack_row_real_cpu_openmp_single(a, row, i-limits(my_prow), my_thread, &
call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(my_prow), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#endif
enddo
......@@ -3986,11 +4039,11 @@
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(my_prow), .false.)