Commit e9d0b2a6 authored by Andreas Marek

Some changes for aurora

parent cf9642d3
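The changes below follow three recurring patterns: host addresses handed to cuda_memcpy / cuda_memcpy2d are now obtained with the non-standard loc() intrinsic cast to c_intptr_t instead of the standard c_loc(); the target attributes that were only needed for c_loc() are dropped from the affected arrays; and assignments to the error argument in the allocate/deallocate wrappers are guarded by present(error) when USE_FORTRAN2008 is defined, which implies error is an optional dummy argument in that configuration. Below is a minimal sketch of the address-passing change only, assuming a 64-bit compiler that supports the loc() extension; the fake_memcpy_* helpers and the real64 working precision are illustrative stand-ins, not ELPA's actual cuda_memcpy interface.

! Illustrative sketch only (not ELPA code): the two ways of obtaining a
! host address that this commit switches between. fake_memcpy_cptr and
! fake_memcpy_addr are hypothetical stand-ins for ELPA's cuda_memcpy wrapper.
program address_passing_sketch
  use, intrinsic :: iso_c_binding,   only: c_loc, c_ptr, c_intptr_t
  use, intrinsic :: iso_fortran_env, only: real64
  implicit none

  ! c_loc() requires the TARGET attribute (the form being removed in this commit)
  real(real64), allocatable, target :: qtmp_old(:,:)
  ! loc() needs no TARGET attribute (the form being introduced in this commit)
  real(real64), allocatable         :: qtmp_new(:,:)
  integer(c_intptr_t)               :: host_addr

  allocate(qtmp_old(4,4), qtmp_new(4,4))
  qtmp_old = 0.0_real64
  qtmp_new = 0.0_real64

  ! old form: hand over a type(c_ptr) from the standard c_loc() intrinsic
  call fake_memcpy_cptr(c_loc(qtmp_old(1,1)))

  ! new form: hand over the raw address from the non-standard loc() extension,
  ! cast to c_intptr_t, as done throughout this commit
  host_addr = int(loc(qtmp_new(1,1)), kind=c_intptr_t)
  call fake_memcpy_addr(host_addr)

contains

  subroutine fake_memcpy_cptr(p)
    type(c_ptr), intent(in) :: p
    ! a real wrapper would forward the address held in p to cudaMemcpy
  end subroutine fake_memcpy_cptr

  subroutine fake_memcpy_addr(addr)
    integer(c_intptr_t), intent(in) :: addr
    ! a real wrapper would forward addr to cudaMemcpy
  end subroutine fake_memcpy_addr

end program address_passing_sketch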
@@ -326,13 +326,17 @@ module elpa
integer :: error2
obj => elpa_impl_allocate(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "ELPA not usable with this error"
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
@@ -341,6 +345,8 @@ module elpa
stop
endif
endif
#endif
end function
@@ -359,7 +365,9 @@ module elpa
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
@@ -368,6 +376,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
@@ -377,15 +386,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
@@ -404,7 +419,9 @@ module elpa
#endif
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
@@ -413,6 +430,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
@@ -422,15 +440,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA autotuning object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
@@ -90,7 +90,7 @@
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable , target :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
#endif
@@ -684,7 +684,7 @@
endif
if (useGPU) then
successCUDA = cuda_memcpy(qtmp1_dev, c_loc(qtmp1(1,1)), &
successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
endif
@@ -749,13 +749,13 @@
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA = cuda_memcpy(qtmp2_dev, c_loc(qtmp2(1,1)), &
successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
@@ -813,7 +813,7 @@
if(useGPU) then
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, c_loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
@@ -843,7 +843,7 @@
if(useGPU) then
!TODO either copy only half of the matrix here, and get rid of the
!previous copy or copy whole array here
successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
@@ -105,10 +105,10 @@
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,*)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout), target :: q_mat(ldq,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
@@ -120,9 +120,9 @@
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
MATH_DATATYPE(kind=rck), allocatable, target :: tmat(:,:), hvm1(:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
@@ -237,7 +237,8 @@
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_memcpy(q_dev, c_loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
@@ -341,13 +342,13 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, c_loc(hvm1(1)), &
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, c_loc(tmat(1,1)), &
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
@@ -385,7 +386,7 @@
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(c_loc(tmp1(1)), tmp_dev, &
successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
@@ -394,7 +395,7 @@
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, c_loc(tmp2(1)), &
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
@@ -451,7 +452,8 @@
if (useGPU) then
!q_mat = q_dev
successCUDA = cuda_memcpy(c_loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
@@ -113,12 +113,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout), target :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
#else
MATH_DATATYPE(kind=rck), intent(inout), target :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
#endif
real(kind=rk), intent(out), target :: d_vec(na)
real(kind=rk), intent(out), target :: e_vec(na)
real(kind=rk), intent(out) :: d_vec(na)
real(kind=rk), intent(out) :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
@@ -149,12 +149,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
complex(kind=rck), target :: aux3(1)
complex(kind=rck) :: aux3(1)
#endif
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), allocatable, target :: v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed
! - differently distributed among MPI tasks
u_row(:), &
u_col(:)
! the following two matrices store pairs of vectors v and u calculated in each step
@@ -162,9 +163,9 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
MATH_DATATYPE(kind=rck), allocatable, target :: vu_stored_rows(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE(kind=rck), allocatable, target :: uv_stored_cols(:,:)
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
@@ -343,7 +344,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, c_loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successCUDA)
endif
@@ -367,9 +369,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! copy l_cols + 1 column of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(c_loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
@@ -487,11 +491,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, c_loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, c_loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
l_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
@@ -622,10 +628,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
enddo
end if !multiplication as one block / per stripes
successCUDA = cuda_memcpy(c_loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(c_loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif
@@ -750,12 +758,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (n_stored_vecs == max_stored_uv .or. istep == 3) then
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, c_loc(vu_stored_rows(1,1)), &
successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, c_loc(uv_stored_cols(1,1)), &
successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
@@ -818,7 +826,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
successCUDA = cuda_memcpy(c_loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
@@ -839,7 +847,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!successCUDA = cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
successCUDA = cuda_memcpy(a_dev + a_offset, c_loc(a_mat(l_rows, l_cols)), &
successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
endif
@@ -854,7 +862,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows)) then
! We use last l_cols value of loop above
if(useGPU) then
successCUDA = cuda_memcpy(c_loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
vrl = aux3(1)
@@ -890,7 +898,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(c_loc(aux3(1)), a_dev, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
@@ -906,7 +914,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(c_loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
else !useGPU
@@ -917,7 +925,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! Store d_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(c_loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
@@ -114,11 +114,11 @@
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), target :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck), target :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), target :: tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
@@ -141,11 +141,11 @@
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck), target :: vav(nbw,nbw)
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), allocatable, target :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
@@ -363,7 +363,8 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, c_loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
(lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -541,7 +542,7 @@
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(c_loc(a_mat(1, lc_start)), &
successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
@@ -853,7 +854,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -934,7 +935,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), c_loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -1097,7 +1098,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(vmr_dev, &
c_loc(vmrCUDA(1)),&
int(loc(vmrCUDA(1)),kind=c_intptr_t),&
vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1107,7 +1108,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
c_loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1169,7 +1170,7 @@
if (useGPU) then
successCUDA = cuda_memcpy( &
c_loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1179,7 +1180,7 @@
endif
successCUDA = cuda_memcpy( &
c_loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1294,7 +1295,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, &
c_loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1302,7 +1303,8 @@
&: error in cudaMemcpy umc_dev 5"
stop 1
endif
successCUDA = cuda_memcpy(tmat_dev,c_loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1316,7 +1318,8 @@
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev,int(loc(vav(1,1)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1334,7 +1337,8 @@
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(c_loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(vav(1,1)),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1372,7 +1376,7 @@
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, c_loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev, int(loc(vav(1,1)),kind=c_intptr_t), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -1400,7 +1404,7 @@
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy( &
c_loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
@@ -1420,7 +1424,7 @@
1, istep*nbw, n_cols, nblk, max_threads)
successCUDA = cuda_memcpy(vmr_dev, &
c_loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
@@ -1430,7 +1434,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
c_loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)