Commit d9b473c0 authored by Andreas Marek

Simplify checking of CUDA return codes

parent 88cbfbf3
@@ -62,7 +62,10 @@
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
#endif
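
For illustration, this is roughly how one of these wrappers behaves under cpp; the label string and the line number 412 are made-up values, not taken from this commit:

  ! a call site in the template, say at line 412:
  check_memcpy_cuda("tridiag: a_dev", successCUDA)
  ! what the preprocessor emits, given the #define above:
  call check_memcpy_CUDA_f("tridiag: a_dev", 412, successCUDA)

The short literal label stands in for __FILE__, which together with its path can exceed gfortran's maximum line length once expanded.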
#if REALCASE == 1
......
@@ -157,19 +157,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=c_intptr_t) :: num
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), allocatable :: v_row(:) ! used to store calculated Householder Vector
MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector
v_col(:) ! the same Vector, but transposed
MATH_DATATYPE(kind=rck), allocatable :: u_col(:), u_row(:)
MATH_DATATYPE(kind=rck), allocatable :: & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed
! - differently distributed among MPI tasks
u_row(:), &
u_col(:)
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored in both row and column form
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: it is a little confusing, I think, that variables _row actually store columns and vice versa
MATH_DATATYPE(kind=rck), allocatable :: vu_stored_rows(:,:)
MATH_DATATYPE(kind=rck), pointer :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
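
As a hedged sketch of the storage scheme described in the comments above (not verbatim from this file; n_stored_vecs counts the pairs stored so far):

  ! after the pair (v, u) of the current step has been computed:
  vu_stored_rows(1:l_rows, 2*n_stored_vecs+1) = v_row(1:l_rows)   ! pattern v1,u1,v2,u2,...
  vu_stored_rows(1:l_rows, 2*n_stored_vecs+2) = u_row(1:l_rows)
  uv_stored_cols(1:l_cols, 2*n_stored_vecs+1) = u_col(1:l_cols)   ! pattern u1,v1,u2,v2,...
  uv_stored_cols(1:l_cols, 2*n_stored_vecs+2) = v_col(1:l_cols)
  n_stored_vecs = n_stored_vecs + 1
  ! once n_stored_vecs reaches max_stored_uv, A is updated explicitly and the buffers are reset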
@@ -177,6 +174,9 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
type(c_ptr) :: v_row_host, v_col_host
type(c_ptr) :: u_row_host, u_col_host
type(c_ptr) :: vu_stored_rows_host, uv_stored_cols_host
real(kind=rk), allocatable :: tmp_real(:)
integer(kind=ik) :: min_tile_size, error
integer(kind=ik) :: istat
@@ -276,22 +276,30 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row one element longer so that tau can be stored and broadcast together with it
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
#ifdef WITH_OPENMP
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
@@ -308,14 +316,6 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
v_col = 0
u_col = 0
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successCUDA)
@@ -398,39 +398,40 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
endif
if (n_stored_vecs > 0 .and. l_rows > 0) then
if (wantDebug) call obj%timer%start("blas")
if (n_stored_vecs > 0 .and. l_rows > 0) then
if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
call PRECISION_GEMV('N', &
call PRECISION_GEMV('N', &
int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
#if REALCASE == 1
uv_stored_cols(l_cols+1,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
uv_stored_cols(l_cols+1,1), &
int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
#endif
#if COMPLEXCASE == 1
aux, 1_BLAS_KIND, &
#endif
ONE, v_row, 1_BLAS_KIND)
if (wantDebug) call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif
endif
if(my_prow == prow(istep-1, nblk, np_rows)) then
aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
aux1(2) = v_row(l_rows)
else
aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
aux1(2) = 0.
endif
if (my_prow == prow(istep-1, nblk, np_rows)) then
aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
aux1(2) = v_row(l_rows)
else
aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
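
For context, this reduction implements the usual Householder setup (LAPACK larfg style, not verbatim from this file): with alpha the subdiagonal entry gathered in aux2(2) and sigma = aux2(1) the squared norm of the remaining entries,

  ! beta = -sign(alpha) * sqrt(|alpha|**2 + sigma)
  ! tau  = (beta - alpha) / beta
  ! v    = x / (alpha - beta)      ! so that (I - tau*v*v^H) x = beta*e_1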
@@ -484,26 +485,26 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
! recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
! recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
call elpa_transpose_vectors_&
! Transpose Householder vector v_row -> v_col
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk, max_threads)
! Calculate u = (A + VU**T + UV**T)*v
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
! thus the result is partly in u_col(:) and partly in u_row(:)
! For cache efficiency, we use only the upper half of the matrix tiles for this,
! thus the result is partly in u_col(:) and partly in u_row(:)
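
A hedged sketch of the tile trick described above (bounds r0:r1, c0:c1 are illustrative): since A is symmetric, each upper-half tile yields both halves of the product, so the lower half of A is never read (for the skew-symmetric case the transposed contribution flips sign).

  ! for an upper-half tile a_mat(r0:r1, c0:c1):
  u_col(c0:c1) = u_col(c0:c1) + matmul(transpose(a_mat(r0:r1,c0:c1)), v_row(r0:r1))
  if (r0 /= c0) &   ! off-diagonal tile: add the transposed contribution as well
    u_row(r0:r1) = u_row(r0:r1) + matmul(a_mat(r0:r1,c0:c1), v_col(c0:c1))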
u_col(1:l_cols) = 0
u_row(1:l_rows) = 0
if (l_rows > 0 .and. l_cols > 0) then
if(useGPU) then
u_col(1:l_cols) = 0
u_row(1:l_rows) = 0
if (l_rows > 0 .and. l_cols > 0) then
if (useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
@@ -553,12 +554,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
-ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
endif
@@ -575,19 +578,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND), &
v_row(l_row_beg), 1_BLAS_KIND, &
ONE, u_col(l_col_beg), 1_BLAS_KIND)
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, &
ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
1_BLAS_KIND)
else
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
1_BLAS_KIND)
endif
endif
if (wantDebug) call obj%timer%stop("blas")
@@ -805,7 +810,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
check_memcpy_cuda("tridiag: uv_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
max_local_cols * 2 * max_stored_uv * &
@@ -843,8 +848,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, vu_stored_rows(l_row_beg,1), int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
uv_stored_cols(l_col_beg,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
ONE, vu_stored_rows(l_row_beg:max_local_rows,1:2*max_stored_uv), &
int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
uv_stored_cols(l_col_beg,1), &
int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND))
if (wantDebug) call obj%timer%stop("blas")
endif !useGPU
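
In matrix terms, the GEMM above applies the accumulated update announced at the top of the routine: with the interleaved storage of vu_stored_rows and uv_stored_cols, a single product gives (a sketch, ignoring tile bounds and the conjugation of the complex case)

  ! A = A + vu_stored_rows * transpose(uv_stored_cols)
  !   = A + v1*u1^T + u1*v1^T + ...   i.e. A + V*U^T + U*V^T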
@@ -988,7 +995,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
endif
#endif
deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage
stop 1
@@ -1049,7 +1056,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
print *,"tridiag: error when deallocating tmp_real "//errorMessage
stop 1
endif
deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage
stop 1
endif
call obj%timer%stop("tridiag_&
&MATH_DATATYPE&
......
@@ -52,6 +52,15 @@
! Author: A. Marek, MPCDF
! cannot use __FILE__ because a filename with its path can exceed gfortran's maximum line length
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
#include "../general/sanity.F90"
use elpa1_compute
@@ -193,67 +202,34 @@
! copy b to b_dev
num = ldb*ldbCols*size_of_datatype
successCUDA = cuda_malloc(b_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc b_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(b),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaHostRegister b"
stop
endif
check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
successCUDA = cuda_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
cudaMemcpyHostToDevice)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy, b H2D"
endif
check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successCUDA)
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_malloc_host(aux_host,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMallocHost aux"
stop
endif
check_host_alloc_cuda("elpa_mult_at_b: aux_host", successCUDA)
call c_f_pointer(aux_host,aux_mat,(/l_rows,nblk_mult/))
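
A minimal sketch of the pinned-buffer pattern used here (buf_host, buf, n1, n2 are illustrative names): cuda_malloc_host returns page-locked memory as a bare c_ptr, and c_f_pointer then exposes it as a normal Fortran array, so BLAS calls and the device memcpys share one allocation.

  type(c_ptr)                      :: buf_host
  MATH_DATATYPE(kind=rck), pointer :: buf(:,:)
  successCUDA = cuda_malloc_host(buf_host, int(n1,kind=c_intptr_t)*n2*size_of_datatype)
  check_host_alloc_cuda("example: buf_host", successCUDA)
  call c_f_pointer(buf_host, buf, (/n1, n2/))   ! view the pinned block as buf(n1,n2)
  ! ... use buf ...
  nullify(buf)
  successCUDA = cuda_free_host(buf_host)
  check_host_dealloc_cuda("example: buf_host", successCUDA)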
successCUDA = cuda_malloc(aux_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc aux_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: aux_dev", successCUDA)
num = nblk_mult*l_cols*size_of_datatype
successCUDA = cuda_malloc_host(tmp1_host,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMallocHost tmp1_host"
stop
endif
check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host,tmp1,(/nblk_mult,l_cols/))
successCUDA = cuda_malloc(tmp1_dev,num)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMalloc tmp1_dev"
stop
endif
check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successCUDA)
else ! useGPU
allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
@@ -265,28 +241,16 @@
endif ! useGPU
allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating aux_bc "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "aux_bc", istat, errorMessage)
allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating lrs_save "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "lrs_save", istat, errorMessage)
allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating lre_save "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "lre_save", istat, errorMessage)
a_lower = .false.
a_upper = .false.
@@ -393,24 +357,15 @@
if (lcs<=lce) then
allocate(tmp1(nstor,lcs:lce), tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error when allocating tmp1 "//errorMessage
stop
endif
call check_alloc("elpa_mult_at_b_&
&MATH_DATATYPE ", "tmp1", istat, errorMessage)
if (lrs<=lre) then
if (useGPU) then
num = l_rows*nblk_mult*size_of_datatype
successCUDA = cuda_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
num, cudaMemcpyHostToDevice)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy aux_mat H2D"
stop
endif
check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successCUDA)
aux_off = (lrs-1)*size_of_datatype
b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
@@ -424,12 +379,7 @@
num = nstor*(lce-lcs+1)*size_of_datatype
successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp1_dev, num, cudaMemcpyDeviceToHost)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaMemcpy tmp1 D2H"
stop
endif
check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successCUDA)
else ! useGPU
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
@@ -478,55 +428,25 @@
if (useGPU) then
successCUDA = cuda_free(b_dev)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaFree b_dev"
stop
endif
check_dealloc_cuda("elpa_multiply_a_b: b_dev", successCUDA)
successCUDA = cuda_host_unregister(int(loc(b),kind=c_intptr_t))
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaHostUnregister b"
stop
endif
check_host_unregister_cuda("elpa_multiply_a_b: b", successCUDA)
nullify(aux_mat)
nullify(tmp1)
successCUDA = cuda_free_host(aux_host)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaFreeHost aux_host"
stop
endif
check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successCUDA)
successCUDA = cuda_free(aux_dev)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaFree aux_dev"
stop
endif
check_dealloc_cuda("elpa_multiply_a_b: aux_dev", successCUDA)
successCUDA = cuda_free_host(tmp1_host)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaFreeHost tmp1_host"
stop 1
endif
check_host_dealloc_cuda("elpa_multiply_a_b: tmp1_host", successCUDA)
successCUDA = cuda_free(tmp1_dev)
if (.not. successCUDA) then
print *,"elpa_mult_at_b_&
&MATH_DATATYPE&
&: error in cudaFree tmp1_dev"
stop
endif
check_dealloc_cuda("elpa_multiply_a_b: tmp1_dev", successCUDA)
else ! useGPU
deallocate(aux_mat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
@@ -61,6 +61,7 @@ module ELPA_utilities
public :: output_unit, error_unit
public :: check_alloc, check_alloc_CUDA_f, check_memcpy_CUDA_f, check_dealloc_CUDA_f
public :: check_host_alloc_CUDA_f, check_host_dealloc_CUDA_f, check_host_register_CUDA_f, check_host_unregister_CUDA_f
public :: map_global_array_index_to_local_index
public :: pcol, prow
public :: local_index ! Get local index of a block cyclic distributed matrix
@@ -206,4 +207,59 @@ module ELPA_utilities
endif
end subroutine
subroutine check_host_alloc_CUDA_f(file_name, line, successCUDA)
implicit none
character(len=*), intent(in) :: file_name
integer(kind=c_int), intent(in) :: line
logical, intent(in) :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_malloc_host when allocating host memory"
stop 1
endif
end subroutine
subroutine check_host_dealloc_CUDA_f(file_name, line, successCUDA)
implicit none
character(len=*), intent(in) :: file_name
integer(kind=c_int), intent(in) :: line
logical, intent(in) :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_free_host when deallocating host memory"
stop 1
endif
end subroutine
subroutine check_host_register_CUDA_f(file_name, line, successCUDA)
implicit none
character(len=*), intent(in) :: file_name
integer(kind=c_int), intent(in) :: line
logical, intent(in) :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_host_register when registering host memory"
stop 1
endif
end subroutine
subroutine check_host_unregister_CUDA_f(file_name, line, successCUDA)
implicit none
character(len=*), intent(in) :: file_name
integer(kind=c_int), intent(in) :: line
logical, intent(in) :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_host_unregister when unregistering host memory"
stop 1
endif
end subroutine
end module ELPA_utilities
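
Taken together with the preprocessor wrappers above, a failed call then reports label, line, and operation in one place; a hedged example of a call site (the label string is illustrative):

  successCUDA = cuda_host_register(int(loc(b),kind=c_intptr_t), num, cudaHostRegisterDefault)
  check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
  ! expands to: call check_host_register_CUDA_f("elpa_mult_at_b: b", <current line>, successCUDA)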