Commit cf9642d3 authored by Andreas Marek

Change loc to c_loc

parent 718e7b89
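
Background for the diffs below: loc() is a non-standard vendor extension that returns an address as a plain integer, whereas c_loc() from the intrinsic iso_c_binding module is standard Fortran 2003. c_loc() returns an opaque type(c_ptr) and is only defined for arguments with the target (or pointer) attribute, which is why the affected array declarations in this commit also gain target. A minimal sketch of the resulting host-to-device copy pattern; my_cuda_memcpy is a hypothetical stand-in, not ELPA's actual wrapper API:

    subroutine copy_to_device_sketch(dev_ptr, n)
      use, intrinsic :: iso_c_binding, only : c_ptr, c_loc, c_size_t, c_int, c_double
      implicit none
      interface
        ! hypothetical C binding; ELPA's real wrapper differs
        integer(c_int) function my_cuda_memcpy(dst, src, nbytes) bind(c)
          import :: c_ptr, c_size_t, c_int
          type(c_ptr), value       :: dst, src
          integer(c_size_t), value :: nbytes
        end function my_cuda_memcpy
      end interface
      type(c_ptr), value                  :: dev_ptr  ! device buffer, already allocated
      integer, intent(in)                 :: n
      ! target is mandatory: c_loc rejects objects without target/pointer
      real(c_double), allocatable, target :: host(:)
      integer(c_int)                      :: istat
      allocate(host(n))
      host = 0.0_c_double
      ! loc(host(1))   : vendor extension, integer address  (old code)
      ! c_loc(host(1)) : standard Fortran 2003, type(c_ptr) (this commit)
      istat = my_cuda_memcpy(dev_ptr, c_loc(host(1)), 8_c_size_t * int(n, c_size_t))
    end subroutine copy_to_device_sketch
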
@@ -90,7 +90,7 @@
 real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
 dbase(na), ddiff(na), ev_scale(na), tmp(na)
 real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
-real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
+real(kind=REAL_DATATYPE), allocatable , target :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
 #ifdef WITH_OPENMP
 real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
 #endif
@@ -684,7 +684,7 @@
 endif
 if (useGPU) then
-successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
+successCUDA = cuda_memcpy(qtmp1_dev, c_loc(qtmp1(1,1)), &
 gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
 endif
@@ -749,13 +749,13 @@
 if(useGPU) then
 !TODO: it should be enough to copy l_rows x ncnt
-successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
+successCUDA = cuda_memcpy(qtmp2_dev, c_loc(qtmp2(1,1)), &
 gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
 !TODO the previous loop could be possible to do on device and thus
 !copy less
-successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
+successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
 gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
 endif
@@ -791,7 +791,7 @@
 !TODO either copy only half of the matrix here, and half after the
 !second gemm, or copy whole array after the next gemm
-! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
+! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
 ! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
 ! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
 endif
@@ -813,7 +813,7 @@
 if(useGPU) then
 !TODO the previous loop could be possible to do on device and thus
 !copy less
-successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
+successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
 gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
 endif
@@ -843,7 +843,7 @@
 if(useGPU) then
 !TODO either copy only half of the matrix here, and get rid of the
 !previous copy or copy whole array here
-successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
+successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
 gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
 endif
@@ -104,9 +104,11 @@
 MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
+MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
+MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,*)
 #else
-MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
+MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
+MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,matrixCols)
 #endif
 logical, intent(in) :: useGPU
 integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
@@ -117,8 +119,10 @@
 integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
 integer(kind=ik) :: hvn_ubnd, hvm_ubnd
-MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
-MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
+MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: tmp1(:), tmp2(:)
+MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
+MATH_DATATYPE(kind=rck), allocatable, target :: tmat(:,:), hvm1(:)
 integer(kind=ik) :: istat
 character(200) :: errorMessage
@@ -233,7 +237,7 @@
 check_alloc_cuda("trans_ev", successCUDA)
 ! q_dev = q_mat
-successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(q_dev, c_loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif ! useGPU
@@ -337,13 +341,13 @@
 hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
 !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
-successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
+successCUDA = cuda_memcpy(hvm_dev, c_loc(hvm1(1)), &
 hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 !tmat_dev = tmat
-successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
+successCUDA = cuda_memcpy(tmat_dev, c_loc(tmat(1,1)), &
 max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif
@@ -381,7 +385,7 @@
 ! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
 ! todo: does it need to be copied whole? Wouldn't a part be sufficient?
 if (useGPU) then
-successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
+successCUDA = cuda_memcpy(c_loc(tmp1(1)), tmp_dev, &
 max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif
@@ -390,7 +394,7 @@
 call obj%timer%stop("mpi_communication")
 ! copy back tmp2 - after reduction...
 if (useGPU) then
-successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
+successCUDA = cuda_memcpy(tmp_dev, c_loc(tmp2(1)), &
 max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif ! useGPU
@@ -447,7 +451,7 @@
 if (useGPU) then
 !q_mat = q_dev
-successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("trans_ev", successCUDA)
 deallocate(hvm1, stat=istat, errmsg=errorMessage)
@@ -114,9 +114,11 @@
 integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
 #ifdef USE_ASSUMED_SIZE
-MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
+MATH_DATATYPE(kind=rck), target :: a_mat(lda,*)
+MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,*)
 #else
-MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
+MATH_DATATYPE(kind=rck), target :: a_mat(lda,matrixCols)
+MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,numBlocks)
 #endif
 #if REALCASE == 1
@@ -138,10 +140,12 @@
 integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
 real(kind=rk) :: vnorm2
-MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
+MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
+MATH_DATATYPE(kind=rck), target :: vav(nbw,nbw)
 ! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
-MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
+MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
+MATH_DATATYPE(kind=rck), allocatable, target :: vmrCUDA(:), umcCUDA(:)
 MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
 MATH_DATATYPE(kind=rck), allocatable :: vr(:)
@@ -359,7 +363,7 @@
 cur_l_rows = 0
 cur_l_cols = 0
-successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(a_dev, c_loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
 &MATH_DATATYPE&
@@ -537,7 +541,7 @@
 cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
 if (my_pcol == cur_pcol) then
-successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
+successCUDA = cuda_memcpy2d(c_loc(a_mat(1, lc_start)), &
 int((lda*size_of_datatype),kind=c_intptr_t), &
 (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
 int(lda*size_of_datatype,kind=c_intptr_t), &
@@ -849,7 +853,7 @@
 if (my_pcol == cur_pcol) then
 successCUDA = cuda_memcpy2d((a_dev+ &
 int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
-int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
+int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
 int(lda*size_of_datatype,kind=c_intptr_t), &
 int(lr_end*size_of_datatype,kind=c_intptr_t), &
 int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -930,7 +934,7 @@
 if (my_pcol == cur_pcol) then
 successCUDA = cuda_memcpy2d((a_dev+ &
 int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
-int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
+int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
 int(lda*size_of_datatype,kind=c_intptr_t), &
 int(lr_end*size_of_datatype,kind=c_intptr_t), &
 int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -1093,7 +1097,7 @@
 if (useGPU) then
 successCUDA = cuda_memcpy(vmr_dev, &
-loc(vmrCUDA(1)),&
+c_loc(vmrCUDA(1)),&
 vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1103,7 +1107,7 @@
 endif
 successCUDA = cuda_memcpy(umc_dev, &
-loc(umcCUDA(1)), &
+c_loc(umcCUDA(1)), &
 umc_size*size_of_datatype,cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1165,7 +1169,7 @@
 if (useGPU) then
 successCUDA = cuda_memcpy( &
-loc(vmrCUDA(1)), &
+c_loc(vmrCUDA(1)), &
 vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1175,7 +1179,7 @@
 endif
 successCUDA = cuda_memcpy( &
-loc(umcCUDA(1)), &
+c_loc(umcCUDA(1)), &
 umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1290,7 +1294,7 @@
 if (useGPU) then
 successCUDA = cuda_memcpy(umc_dev, &
-loc(umcCUDA(1)), &
+c_loc(umcCUDA(1)), &
 umc_size*size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1298,7 +1302,7 @@
 &: error in cudaMemcpy umc_dev 5"
 stop 1
 endif
-successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(tmat_dev,c_loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
 &MATH_DATATYPE&
@@ -1312,7 +1316,7 @@
 call obj%timer%stop("cublas")
 ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
-successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(vav_dev,c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
 &MATH_DATATYPE&
@@ -1330,7 +1334,7 @@
 n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
 call obj%timer%stop("cublas")
-successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"bandred_&
 &MATH_DATATYPE&
@@ -1368,7 +1372,7 @@
 (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
 if (useGPU) then
-successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(vav_dev, c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
 &MATH_DATATYPE&
@@ -1396,7 +1400,7 @@
 call obj%timer%stop("cublas")
 successCUDA = cuda_memcpy( &
-loc(umcCUDA(1)), &
+c_loc(umcCUDA(1)), &
 umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
@@ -1416,7 +1420,7 @@
 1, istep*nbw, n_cols, nblk, max_threads)
 successCUDA = cuda_memcpy(vmr_dev, &
-loc(vmrCUDA(1)), &
+c_loc(vmrCUDA(1)), &
 vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1426,7 +1430,7 @@
 endif
 successCUDA = cuda_memcpy(umc_dev, &
-loc(umcCUDA(1)), &
+c_loc(umcCUDA(1)), &
 umc_size*size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -1687,7 +1691,8 @@
 ! (band to tridi). Previously, a has been kept on the device and then
 ! copied in redist_band (called from tridiag_band). However, it seems to
 ! be easier to do it here.
-successCUDA = cuda_memcpy (loc(a_mat), int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
+successCUDA = cuda_memcpy (c_loc(a_mat), &
+int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
 cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"bandred_&
@@ -700,7 +700,7 @@
 ! if the second backward step is to be performed, but not on GPU, we have
 ! to transfer q to the host
 if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
-successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"elpa2_template, error in copy to host"
 stop 1
@@ -723,7 +723,7 @@
 ! copy to device if we want to continue on GPU
 successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
-successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(q_dev, c_loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"elpa2_template, error in copy to device"
 stop 1
@@ -110,9 +110,11 @@
 #endif
 integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
 #ifdef USE_ASSUMED_SIZE
-MATH_DATATYPE(kind=rck) :: a_mat(lda,*), q_mat(ldq,*), tmat(nbw,nbw,*)
+MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
+MATH_DATATYPE(kind=rck), target :: q_mat(ldq,*), tmat(nbw,nbw,*)
 #else
-MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
+MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
+MATH_DATATYPE(kind=rck), target :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
 #endif
 integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
@@ -122,7 +124,8 @@
 integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
 integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
-MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
+MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
+MATH_DATATYPE(kind=rck), allocatable, target :: tmp1(:), tmp2(:), hvm(:,:)
 ! hvm_dev is first used and set in this routine
 ! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
 ! tmp_dev is first used in this routine
@@ -268,7 +271,7 @@
 ! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols)
 ! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
-! successCUDA = cuda_memcpy(q_dev, loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
+! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
 ! if (.not.(successCUDA)) then
 ! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
 ! stop 1
@@ -281,7 +284,7 @@
 ! stop 1
 ! endif
 !
-! successCUDA = cuda_memcpy(q_dev, loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
+! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
 ! if (.not.(successCUDA)) then
 ! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
 ! stop 1
@@ -346,7 +349,7 @@
 nb = nb+l_rows
 enddo
-successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(hvm_dev, c_loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
@@ -369,7 +372,7 @@
 ! copy data from device to host for a later MPI_ALLREDUCE
 ! copy to host; maybe this can be avoided, but it is needed if MPI is used (allreduce)
-successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
 stop 1
@@ -398,7 +401,7 @@
 #ifdef WITH_MPI
 ! after the mpi_allreduce we have to copy back to the device
 ! copy back to device
-successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
+successCUDA = cuda_memcpy(tmp_dev, c_loc(tmp2), n_cols*l_cols* size_of_datatype, &
 cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_&
@@ -414,7 +417,7 @@
 ! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
 ! tmat is a 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
 ! need to upload another slice
-successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(tmat_dev, c_loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_&
@@ -434,7 +437,7 @@
 ! copy to host, maybe this can be avoided
 ! this is not necessary, hvm is not used anymore
-successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
 stop 1
@@ -779,7 +782,7 @@
 ! final transfer of q_dev
-successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_&
@@ -108,9 +108,9 @@
 integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
 #ifdef USE_ASSUMED_SIZE
-MATH_DATATYPE(kind=rck) :: q(ldq,*)
+MATH_DATATYPE(kind=rck), target :: q(ldq,*)
 #else
-MATH_DATATYPE(kind=rck) :: q(ldq,matrixCols)
+MATH_DATATYPE(kind=rck), target :: q(ldq,matrixCols)
 #endif
 MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
@@ -143,11 +143,15 @@
 MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
 #ifdef WITH_OPENMP
-MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
-MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: top_border_send_buffer(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: top_border_recv_buffer(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_send_buffer(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_recv_buffer(:,:)
 #else
-MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
-MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: top_border_send_buffer(:,:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: top_border_recv_buffer(:,:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_send_buffer(:,:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_recv_buffer(:,:,:)
 #endif
 integer(kind=c_intptr_t) :: aIntern_dev
@@ -164,7 +168,7 @@
 integer(kind=ik) :: top, chunk, this_chunk
 MATH_DATATYPE(kind=rck), allocatable :: result_buffer(:,:,:)
-MATH_DATATYPE(kind=rck), allocatable :: bcast_buffer(:,:)
+MATH_DATATYPE(kind=rck), allocatable, target :: bcast_buffer(:,:)
 integer(kind=ik) :: n_off
@@ -1234,7 +1238,7 @@
 #endif /* WITH_MPI */
 if (useGPU) then
-successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
+successCUDA = cuda_memcpy(bcast_buffer_dev, c_loc(bcast_buffer(1,1)), &
 nbw * current_local_n * &
 size_of_datatype, &
 cudaMemcpyHostToDevice)
@@ -1345,7 +1349,7 @@
 if (useGPU) then
 dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
-successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
+successCUDA = cuda_memcpy( aIntern_dev + dev_offset , c_loc(bottom_border_recv_buffer(1,1,i)), &
 stripe_width*nbw* size_of_datatype, &
 cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
@@ -1430,7 +1434,7 @@
 if (useGPU) then
 dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
 ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
-successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
+successCUDA = cuda_memcpy( aIntern_dev+dev_offset , c_loc(top_border_recv_buffer(1,1,i)), &
 stripe_width*top_msg_length* size_of_datatype, &
 cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
@@ -1523,7 +1527,7 @@
 if (useGPU) then
 dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
+successCUDA = cuda_memcpy( c_loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
 stripe_width * bottom_msg_length * size_of_datatype, &
 cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
@@ -1634,7 +1638,7 @@
 if (useGPU) then
 dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
+successCUDA = cuda_memcpy( c_loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
 stripe_width*bottom_msg_length* size_of_datatype, &
 cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
@@ -1730,7 +1734,7 @@
 #endif
 if (useGPU) then
 dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
+successCUDA = cuda_memcpy( aIntern_dev + dev_offset , c_loc( top_border_recv_buffer(:,1,i)), &
 stripe_width * top_msg_length * size_of_datatype, &
 cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
@@ -1858,7 +1862,7 @@
 #endif
 if (useGPU) then
 dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
+successCUDA = cuda_memcpy( c_loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
 stripe_width*nbw * size_of_datatype, &
 cudaMemcpyDeviceToHost)
 if (.not.(successCUDA)) then
@@ -2205,7 +2209,7 @@
 endif
 ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
-successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
+successCUDA = cuda_memcpy(q_dev, c_loc(q), (ldq)*(matrixCols)* size_of_datatype, &
 cudaMemcpyHostToDevice)
 if (.not.(successCUDA)) then
 print *,"trans_ev_tridi_to_band_&
@@ -72,7 +72,7 @@
 ! Safety only:
 if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
-if(mod(loc(q),16) /= 0) STOP 'Q unaligned!'
+if(mod(c_loc(q),16) /= 0) STOP 'Q unaligned!'
 ! Calculate dot product of the two Householder vectors
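
A caveat on the kernel change just above: c_loc() yields a type(c_ptr) rather than an integer, so mod(c_loc(q),16) relies on a non-standard conversion. A standard-conforming form of the same 16-byte alignment check would convert the pointer explicitly, roughly as follows (a sketch, not part of the commit; assumes q carries the target attribute):

      use, intrinsic :: iso_c_binding, only : c_loc, c_intptr_t
      integer(c_intptr_t) :: addr
      ! transfer turns the opaque c_ptr into an integer address
      addr = transfer(c_loc(q(1,1)), addr)
      if (mod(addr, 16_c_intptr_t) /= 0) STOP 'Q unaligned!'
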
@@ -63,7 +63,7 @@
 real(kind=C_DATATYPE_KIND) :: rows(:,:)
 #endif
 #if COMPLEXCASE == 1
-complex(kind=C_DATATYPE_KIND):: rows(:,:)
+complex(kind=C_DATATYPE_KIND) :: rows(:,:)
 #endif
 integer(kind=ik) :: max_idx
 logical :: successCUDA
@@ -117,10 +117,10 @@
 integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
 integer(kind=ik), intent(in) :: n_offset, row_count
 #if REALCASE == 1
-real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
+real(kind=C_DATATYPE_KIND), intent(in)    :: rows(:, :)
 #endif
 #if COMPLEXCASE == 1
-complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
+complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
 #endif
 integer(kind=ik) :: max_idx
@@ -38,7 +38,7 @@ module matrix_plot
 integer(kind=ik) :: row, col, mpi_rank
 integer(kind=ik), save :: counter = 0
-real(kind=REAL_DATATYPE) :: a_dev_helper(lda,matrixCols)
+real(kind=REAL_DATATYPE), target :: a_dev_helper(lda,matrixCols)
 logical :: successCUDA
 integer(kind=c_size_t), parameter :: size_of_datatype = size_of_double_real
@@ -59,7 +59,7 @@ module matrix_plot
 ! print a_dev
 if(useGpu) then
-successCUDA = cuda_memcpy(loc(a_dev_helper(1,1)), a_dev, lda * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
+successCUDA = cuda_memcpy(c_loc(a_dev_helper(1,1)), a_dev, lda * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
 write(filename, "(A,A,I0.4,A,I0.2,A)") trim(directory), "/a_dev-", counter, "-", mpi_rank, ".txt"
 write(*,*) trim(filename)