Commit cf9642d3 authored by Andreas Marek

Change loc to c_loc

parent 718e7b89
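
For context: loc() is a non-standard vendor extension that returns the address of a variable as an integer, whereas c_loc() from the intrinsic iso_c_binding module is the standard way to obtain a type(c_ptr) for C interoperability. Since c_loc() may only be applied to variables that have the target (or pointer) attribute, the commit also adds target to the declarations of the host arrays that are passed to the CUDA copy routines. The sketch below shows the standard-conforming pattern; the my_memcpy interface is hypothetical and is not ELPA's actual cuda_memcpy wrapper.

! Minimal sketch (not ELPA code) of the standard-conforming pattern.
! The interface my_memcpy is illustrative only and stands in for any
! C-interoperable copy routine.
module c_loc_sketch
  use, intrinsic :: iso_c_binding, only : c_ptr, c_loc, c_double, c_size_t, c_sizeof
  implicit none
  interface
    subroutine my_memcpy(dst, src, nbytes) bind(C, name="my_memcpy")
      import :: c_ptr, c_size_t
      type(c_ptr), value       :: dst, src
      integer(c_size_t), value :: nbytes
    end subroutine my_memcpy
  end interface
contains
  subroutine host_to_dev(host, dev, n)
    ! c_loc() is only valid for variables with the TARGET (or POINTER)
    ! attribute, which is why this commit also adds "target" to the
    ! declarations of the arrays whose addresses are taken.
    real(kind=c_double), intent(inout), target, contiguous :: host(:,:)
    type(c_ptr), intent(in)                                :: dev
    integer(kind=c_size_t), intent(in)                     :: n
    call my_memcpy(dev, c_loc(host(1,1)), n * c_sizeof(host(1,1)))
  end subroutine host_to_dev
end module c_loc_sketch

Passing a type(c_ptr) rather than an integer address obtained from loc() keeps the calls standard-conforming and lets the compiler check the interfaces; the integer kind returned by loc() can also differ between compilers.
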
@@ -90,7 +90,7 @@
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
real(kind=REAL_DATATYPE), allocatable , target :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
#endif
@@ -684,7 +684,7 @@
endif
if (useGPU) then
successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
successCUDA = cuda_memcpy(qtmp1_dev, c_loc(qtmp1(1,1)), &
gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
endif
@@ -749,13 +749,13 @@
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
successCUDA = cuda_memcpy(qtmp2_dev, c_loc(qtmp2(1,1)), &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
!TODO the previous loop could possibly be done on the device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
@@ -791,7 +791,7 @@
!TODO either copy only half of the matrix here, and half after the
!second gemm, or copy whole array after the next gemm
! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
@@ -813,7 +813,7 @@
if(useGPU) then
!TODO the previous loop could possibly be done on the device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
@@ -843,7 +843,7 @@
if(useGPU) then
!TODO either copy only half of the matrix here, and get rid of the
!previous copy or copy whole array here
successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
@@ -104,9 +104,11 @@
MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
@@ -117,8 +119,10 @@
integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
MATH_DATATYPE(kind=rck), allocatable, target :: tmat(:,:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
@@ -233,7 +237,7 @@
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, c_loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
@@ -337,13 +341,13 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
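! (hvm is flattened into the contiguous 1-D buffer hvm1 so that hvm_ubnd*nstor elements can be transferred with a single memcpy below)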
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
successCUDA = cuda_memcpy(hvm_dev, c_loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
successCUDA = cuda_memcpy(tmat_dev, c_loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
@@ -381,7 +385,7 @@
! In the legacy GPU version, this allreduce was omitted. But it probably has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
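! (the device buffer tmp_dev is copied into tmp1 on the host here; after the MPI reduction, tmp2 is copied back to tmp_dev below)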
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
successCUDA = cuda_memcpy(c_loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
@@ -390,7 +394,7 @@
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
successCUDA = cuda_memcpy(tmp_dev, c_loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
@@ -447,7 +451,7 @@
if (useGPU) then
!q_mat = q_dev
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
@@ -111,14 +111,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout), target :: a_mat(lda,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout), target :: a_mat(lda,matrixCols)
#endif
real(kind=rk), intent(out) :: d_vec(na), e_vec(na)
real(kind=rk), intent(out), target :: d_vec(na)
real(kind=rk), intent(out), target :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
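! (max_stored_uv bounds how many u/v vector pairs are accumulated in vu_stored_rows/uv_stored_cols before they are transferred to the device; mat_vec_as_one_block chooses between doing the matrix-vector products as one block or per stripes, see below)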
@@ -149,11 +149,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
complex(kind=rck) :: aux3(1)
complex(kind=rck), target :: aux3(1)
#endif
MATH_DATATYPE(kind=rck), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder Vector
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), allocatable, target :: v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
u_row(:), &
u_col(:)
@@ -162,9 +162,9 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is a little bit confusing, I think, that the _row variables actually store columns and vice versa
MATH_DATATYPE(kind=rck), allocatable :: vu_stored_rows(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: uv_stored_cols(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
@@ -343,7 +343,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, c_loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successCUDA)
endif
@@ -369,7 +369,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
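! (a_offset is the byte offset of column l_cols+1 in the column-major device copy of a_mat, so the transfer below fills v_row with a_mat(1:l_rows,l_cols+1), matching the CPU branch in the else part)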
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
@@ -487,11 +487,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_col_dev, c_loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_row_dev, c_loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
@@ -622,10 +622,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
enddo
end if !multiplication as one block / per stripes
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif
@@ -750,12 +750,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (n_stored_vecs == max_stored_uv .or. istep == 3) then
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
successCUDA = cuda_memcpy(vu_stored_rows_dev, c_loc(vu_stored_rows(1,1)), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
successCUDA = cuda_memcpy(uv_stored_cols_dev, c_loc(uv_stored_cols(1,1)), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
@@ -818,7 +818,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
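! (byte offset of the single element a_mat(l_rows,l_cols) within the column-major lda x matrixCols layout on the device)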
successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
successCUDA = cuda_memcpy(c_loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
@@ -839,7 +839,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!successCUDA = cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
successCUDA = cuda_memcpy(a_dev + a_offset, c_loc(a_mat(l_rows, l_cols)), &
int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
endif
@@ -854,7 +854,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows)) then
! We use the last l_cols value of the loop above
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(c_loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
vrl = aux3(1)
@@ -890,7 +890,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
successCUDA = cuda_memcpy(c_loc(aux3(1)), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
@@ -906,7 +906,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(c_loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
else !useGPU
@@ -917,7 +917,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! Store d_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
@@ -114,9 +114,11 @@
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck), target :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck), target :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
@@ -138,10 +140,12 @@
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck), target :: vav(nbw,nbw)
! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), allocatable, target :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
@@ -359,7 +363,7 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, c_loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -537,7 +541,7 @@
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
successCUDA = cuda_memcpy2d(c_loc(a_mat(1, lc_start)), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
@@ -849,7 +853,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -930,7 +934,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -1093,7 +1097,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)),&
c_loc(vmrCUDA(1)),&
vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1103,7 +1107,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
c_loc(umcCUDA(1)), &
umc_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1165,7 +1169,7 @@
if (useGPU) then
successCUDA = cuda_memcpy( &
loc(vmrCUDA(1)), &
c_loc(vmrCUDA(1)), &
vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1175,7 +1179,7 @@
endif
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
c_loc(umcCUDA(1)), &
umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1290,7 +1294,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
c_loc(umcCUDA(1)), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1298,7 +1302,7 @@
&: error in cudaMemcpy umc_dev 5"
stop 1
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev,c_loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1312,7 +1316,7 @@
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev,c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1330,7 +1334,7 @@
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1368,7 +1372,7 @@
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev, c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1396,7 +1400,7 @@
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
c_loc(umcCUDA(1)), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
@@ -1416,7 +1420,7 @@
1, istep*nbw, n_cols, nblk, max_threads)
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)), &
c_loc(vmrCUDA(1)), &
vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1426,7 +1430,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
c_loc(umcCUDA(1)), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1687,7 +1691,8 @@
! (band to tridi). Previously, a has been kept on the device and then
! copied in redist_band (called from tridiag_band). However, it seems to
! be easier to do it here.
successCUDA = cuda_memcpy (loc(a_mat), int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
successCUDA = cuda_memcpy (c_loc(a_mat), &
int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -700,7 +700,7 @@
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
@@ -723,7 +723,7 @@
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, c_loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
@@ -110,9 +110,11 @@
#endif
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), q_mat(ldq,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), target :: q_mat(ldq,*), tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), target :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real; at the moment not used, since it is copied in bandred_real
@@ -122,7 +124,8 @@
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
MATH_DATATYPE(kind=rck), allocatable, target :: tmp1(:), tmp2(:), hvm(:,:)
! hvm_dev is first used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to the device and passed here. This can be adapted
! tmp_dev is first used in this routine
@@ -268,7 +271,7 @@
! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols)
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop 1
@@ -281,7 +284,7 @@
! stop 1
! endif
!
! successCUDA = cuda_memcpy(q_dev, loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
! stop 1
@@ -346,7 +349,7 @@
nb = nb+l_rows
enddo
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(hvm_dev, c_loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
@@ -369,7 +372,7 @@
! copy data from device to host for a later MPI_ALLREDUCE
! copy to host; maybe this can be avoided. This is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(c_loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
stop 1
@@ -398,7 +401,7 @@
#ifdef WITH_MPI
! after the mpi_allreduce we have to copy back to the device
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
successCUDA = cuda_memcpy(tmp_dev, c_loc(tmp2), n_cols*l_cols* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
@@ -414,7 +417,7 @@
! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
! tmat is a 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
! need to upload another slice
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev, c_loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
@@ -434,7 +437,7 @@
! copy to host maybe this can be avoided