Commit df11324c authored by Andreas Marek

Checking cuda return messages in elpa2_bandred

parent 896e68b5
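
This commit collapses the repeated inline CUDA error checks in the tridiag and bandred kernels into one-line preprocessor macros that pass a short location label and __LINE__ to shared checker subroutines in ELPA_utilities. A condensed sketch of the before/after pattern (the call site shown here is illustrative; the actual hunks follow):

! before: every CUDA call carried its own five-line check
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_datatype)
if (.not.(successCUDA)) then
  print *,"bandred: error in cudaMalloc a_dev"
  stop 1
endif

! after: one macro performs the same check and reports the source line
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_datatype)
check_alloc_cuda("bandred: a_dev", successCUDA)
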
@@ -283,32 +283,27 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
successCUDA = cuda_malloc_host(v_row_host,num)
check_alloc_cuda("tridiag: v_row_host", successCUDA)
check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
call c_f_pointer(v_row_host,v_row,(/num/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(v_col_host,num)
check_alloc_cuda("tridiag: v_col_host", successCUDA)
check_host_alloc_cuda("tridiag: v_col_host", successCUDA)
call c_f_pointer(v_col_host,v_col,(/num/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(u_col_host,num)
check_alloc_cuda("tridiag: u_col_host", successCUDA)
check_host_alloc_cuda("tridiag: u_col_host", successCUDA)
call c_f_pointer(u_col_host,u_col,(/num/))
num = (max_local_rows) * size_of_datatype
successCUDA = cuda_malloc_host(u_row_host,num)
check_alloc_cuda("tridiag: u_row_host", successCUDA)
check_host_alloc_cuda("tridiag: u_row_host", successCUDA)
call c_f_pointer(u_row_host,u_row,(/num/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_malloc_host(vu_stored_rows_host,num)
check_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA)
check_host_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA)
call c_f_pointer(vu_stored_rows_host,vu_stored_rows,(/max_local_rows,2*max_stored_uv/))
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
@@ -469,5 +469,4 @@
&MATH_DATATYPE&
&_&
&PRECISION&
&")
&"//gpuString)
@@ -288,20 +288,10 @@
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc a_dev 1"
stop 1
endif
check_alloc_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc vav_dev 1"
stop 1
endif
check_alloc_cuda("bandred: vav_dev", successCUDA)
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
@@ -390,32 +380,17 @@
successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaHostRegisterDefault)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaHostRegister a_mat"
stop 1
endif
check_host_register_cuda("bandred: a_mat", successCUDA)
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy a_dev 2"
stop 1
endif
check_memcpy_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
check_alloc_cuda("bandred: tmat_dev", successCUDA)
istep = (na-1)/nbw
n_cols = min(na,(istep+1)*nbw)-istep*nbw
@@ -448,40 +423,18 @@
endif
successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMallocHost vmr_host"
stop 1
endif
check_host_alloc_cuda("bandred: vmr_host", successCUDA)
call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/))
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc: vmr_dev2"
stop 1
endif
check_alloc_cuda("bandred: vmr_dev", successCUDA)
successCUDA = cuda_malloc_host(umc_host,umc_size*size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMallocHost umc_host"
stop 1
endif
check_host_alloc_cuda("bandred: umc_host", successCUDA)
call c_f_pointer(umc_host, umcCUDA, (/umc_size/))
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc umc_dev 2"
stop 1
endif
check_alloc_cuda("bandred: umc_dev", successCUDA)
endif ! useGPU
@@ -569,12 +522,7 @@
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy2d"
stop 1
endif
check_memcpy_cuda("bandred: a_dev -> a_mat", successCUDA)
endif
endif ! useGPU
@@ -873,12 +821,7 @@
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop 1
endif
check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA)
endif
endif
@@ -954,12 +897,7 @@
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop 1
endif
check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA)
endif
endif
@@ -1119,40 +1057,20 @@
if (useGPU) then
successCUDA = cuda_memset(vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
0, cur_l_rows*n_cols*size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemset vmr_dev 3"
stop 1
endif
check_memset_cuda("bandred: vmr_dev", successCUDA)
successCUDA = cuda_memcpy(vmr_dev, int(loc(vmrCUDA(1)),kind=c_intptr_t), &
cur_l_rows*n_cols*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy vmr_dev 3"
stop 1
endif
check_memcpy_cuda("bandred: vmrCUDA -> vmr_dev", successCUDA)
successCUDA = cuda_memset(umc_dev, 0, l_cols*n_cols*size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemset umc_dev 3"
stop 1
endif
check_memset_cuda("bandred: umc_dev", successCUDA)
successCUDA = cuda_memcpy(umc_dev+l_cols*n_cols*size_of_datatype, &
int(loc(umcCUDA(1+l_cols*n_cols)),kind=c_intptr_t), &
(umc_size-l_cols*n_cols)*size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy umc_dev 3"
stop 1
endif
check_memcpy_cuda("bandred: umcCUDA -> umc_dev", successCUDA)
endif ! useGPU
do i=0,(istep*nbw-1)/tile_size
@@ -1233,22 +1151,12 @@
successCUDA = cuda_memcpy(int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), &
vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
(vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy vmr_dev 4"
stop 1
endif
check_memcpy_cuda("bandred: vmr_dev -> vmrCUDA", successCUDA)
endif
successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy umc_dev 4"
stop 1
endif
check_memcpy_cuda("bandred: umc_dev -> umcCUDA", successCUDA)
endif ! useGPU
endif ! l_cols>0 .and. l_rows>0
@@ -1351,21 +1259,11 @@
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, int(loc(umcCUDA(1)),kind=c_intptr_t), &
l_cols*n_cols*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy umc_dev 5"
stop 1
endif
check_memcpy_cuda("bandred: umcCUDA -> umc_dev ", successCUDA)
successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy tmat_dev 2"
stop 1
endif
check_memcpy_cuda("bandred: tmat -> tmat_dev ", successCUDA)
call obj%timer%start("cublas")
call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
@@ -1385,12 +1283,7 @@
successCUDA = cuda_memcpy(int(loc(vav),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy vav_dev3"
stop 1
endif
check_memcpy_cuda("bandred: vav_dev -> vav ", successCUDA)
else ! useGPU
call obj%timer%start("blas")
@@ -1438,12 +1331,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, int(loc(vav),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy vav_dev4"
stop 1
endif
check_memcpy_cuda("bandred: vav -> vav_dev ", successCUDA)
endif
! U = U - 0.5 * V * VAV
@@ -1479,12 +1367,7 @@
successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy umc_dev 6"
stop 1
endif
check_memcpy_cuda("bandred: umc_dev -> umcCUDA ", successCUDA)
! Transpose umc -> umr (stored in vmr, second half)
if (isSkewsymmetric) then
@@ -1508,12 +1391,7 @@
successCUDA = cuda_memcpy(vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), &
(vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy vmr_dev 5 "
stop 1
endif
check_memcpy_cuda("bandred: vmr -> vmrCUDA ", successCUDA)
else ! useGPU
call obj%timer%start("blas")
@@ -1675,83 +1553,38 @@
int(a_dev,kind=c_intptr_t), &
int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop 1
endif
check_memcpy_cuda("bandred: a_dev -> a_mat ", successCUDA)
successCUDA = cuda_host_unregister(int(loc(a_mat),kind=c_intptr_t))
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaHostUnregister a_mat"
stop 1
endif
check_host_unregister_cuda("bandred: a_mat ", successCUDA)
successCUDA = cuda_free(a_dev)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree a_dev"
stop 1
endif
check_dealloc_cuda("bandred: a_dev ", successCUDA)
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree vav_dev 4"
stop 1
endif
check_dealloc_cuda("bandred: vav_dev ", successCUDA)
successCUDA = cuda_free(tmat_dev)
if (.not. successCUDA) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree tmat_dev"
stop 1
endif
check_dealloc_cuda("bandred: tmat_dev ", successCUDA)
if (associated(umcCUDA)) then
nullify(umcCUDA)
successCUDA = cuda_free_host(umc_host)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaFreeHost umc_host"
stop 1
endif
check_host_dealloc_cuda("bandred: umc_host ", successCUDA)
successCUDA = cuda_free(umc_dev)
if (.not. successCUDA) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree umc_dev 8"
stop
endif
check_dealloc_cuda("bandred: umc_dev ", successCUDA)
endif
if (associated(vmrCUDA)) then
nullify(vmrCUDA)
successCUDA = cuda_free_host(vmr_host)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaFreeHost vmr_host"
stop 1
endif
check_host_dealloc_cuda("bandred: vmr_host ", successCUDA)
successCUDA = cuda_free(vmr_dev)
if (.not. successCUDA) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree vmr_dev 6"
stop 1
endif
check_dealloc_cuda("bandred: vmr_dev ", successCUDA)
endif
endif ! useGPU
@@ -55,6 +55,19 @@
#include "../general/sanity.F90"
#if REALCASE == 1
! cannot use __FILE__ here: the filename with its full path can exceed gfortran's maximum source-line length
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
#define check_memset_cuda(file, success) call check_memset_CUDA_f(file, __LINE__, success)
#endif
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa2_bandred_template.F90"
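
The comment above the macro block explains why a hand-written label is passed instead of __FILE__: free-form Fortran source lines are limited to 132 characters (gfortran's default), and an expanded call carrying the full build path of a template file can exceed that. A sketch of the expansion the preprocessor produces (the line number 293 and the long path are illustrative):

check_memcpy_cuda("bandred: a_dev", successCUDA)
! expands to something like:
! call check_memcpy_CUDA_f("bandred: a_dev", 293, successCUDA)
! with __FILE__ instead of a short label it could become an over-long line:
! call check_memcpy_CUDA_f("/some/long/build/prefix/src/elpa2/elpa2_bandred_template.F90", 293, successCUDA)
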
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
@@ -49,6 +50,7 @@
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
#include "elpa/elpa_simd_constants.h"
@@ -62,6 +62,7 @@ module ELPA_utilities
public :: output_unit, error_unit
public :: check_alloc, check_alloc_CUDA_f, check_memcpy_CUDA_f, check_dealloc_CUDA_f
public :: check_host_alloc_CUDA_f, check_host_dealloc_CUDA_f, check_host_register_CUDA_f, check_host_unregister_CUDA_f
public :: check_memset_CUDA_f
public :: map_global_array_index_to_local_index
public :: pcol, prow
public :: local_index ! Get local index of a block cyclic distributed matrix
@@ -262,4 +263,18 @@ module ELPA_utilities
stop 1
endif
end subroutine
subroutine check_memset_CUDA_f(file_name, line, successCUDA)
implicit none
character(len=*), intent(in) :: file_name
integer(kind=c_int), intent(in) :: line
logical, intent(in) :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_memset "
stop 1
endif
end subroutine
end module ELPA_utilities
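
For reference, the new check_memset_CUDA_f follows the same pattern as the existing checkers: on failure it prints the label, the source line, and a fixed message, then stops. A minimal standalone sketch (a toy program outside ELPA; the label and line number are made up) that reproduces the behaviour:

program check_memset_demo
  use, intrinsic :: iso_c_binding, only: c_int
  implicit none
  logical :: successCUDA
  ! pretend a cuda_memset call just failed
  successCUDA = .false.
  call check_memset_CUDA_f("demo: fake_dev", 42_c_int, successCUDA)
contains
  ! same shape as the subroutine added to ELPA_utilities above
  subroutine check_memset_CUDA_f(file_name, line, successCUDA)
    character(len=*), intent(in) :: file_name
    integer(kind=c_int), intent(in) :: line
    logical, intent(in) :: successCUDA
    if (.not.(successCUDA)) then
      print *, file_name, ":", line, " error in cuda_memset "
      stop 1
    endif
  end subroutine
end program
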