Commit fe2e6726 authored by Andreas Marek
Browse files

Rename check_*_CUDA -> check_*_GPU

parent f515b7b2
......@@ -243,44 +243,44 @@ subroutine trans_ev_&
!&", "hvm1", istat, errorMessage)
num = (max_local_rows*max_stored_rows) * size_of_datatype
successGPU = gpu_malloc_host(hvm1_host,num)
check_alloc_cuda("trans_ev: hvm1_host", successGPU)
check_alloc_gpu("trans_ev: hvm1_host", successGPU)
call c_f_pointer(hvm1_host,hvm1,(/(max_local_rows*max_stored_rows)/))
num = (max_stored_rows*max_stored_rows) * size_of_datatype
successGPU = gpu_malloc_host(tmat_host,num)
check_alloc_cuda("trans_ev: tmat_host", successGPU)
check_alloc_gpu("trans_ev: tmat_host", successGPU)
call c_f_pointer(tmat_host,tmat,(/max_stored_rows,max_stored_rows/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successGPU = gpu_malloc_host(tmp1_host,num)
check_alloc_cuda("trans_ev: tmp1_host", successGPU)
check_alloc_gpu("trans_ev: tmp1_host", successGPU)
call c_f_pointer(tmp1_host,tmp1,(/(max_local_cols*max_stored_rows)/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successGPU = gpu_malloc_host(tmp2_host,num)
check_alloc_cuda("trans_ev: tmp2_host", successGPU)
check_alloc_gpu("trans_ev: tmp2_host", successGPU)
call c_f_pointer(tmp2_host,tmp2,(/(max_local_cols*max_stored_rows)/))
successGPU = gpu_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successGPU)
check_alloc_gpu("trans_ev", successGPU)
successGPU = gpu_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successGPU)
check_alloc_gpu("trans_ev", successGPU)
successGPU = gpu_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successGPU)
check_alloc_gpu("trans_ev", successGPU)
num = ldq * matrixCols * size_of_datatype
successGPU = gpu_malloc(q_dev, num)
check_alloc_cuda("trans_ev", successGPU)
check_alloc_gpu("trans_ev", successGPU)
successGPU = gpu_host_register(int(loc(q_mat),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("trans_ev: q_mat", successGPU)
check_host_register_gpu("trans_ev: q_mat", successGPU)
successGPU = gpu_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
endif ! useGPU
do istep = 1, na, blockStep
......@@ -390,12 +390,12 @@ subroutine trans_ev_&
successGPU = gpu_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
!tmat_dev = tmat
successGPU = gpu_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
endif
! Q = Q - V * T * V**T * Q
......@@ -422,7 +422,7 @@ subroutine trans_ev_&
if (useGPU) then
successGPU = gpu_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
else
tmp1(1:l_cols*nstor) = 0
endif
......@@ -434,7 +434,7 @@ subroutine trans_ev_&
if (useGPU) then
successGPU = gpu_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
endif
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, &
......@@ -444,7 +444,7 @@ subroutine trans_ev_&
if (useGPU) then
successGPU = gpu_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
endif ! useGPU
......@@ -500,25 +500,25 @@ subroutine trans_ev_&
!q_mat = q_dev
successGPU = gpu_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successGPU)
check_memcpy_gpu("trans_ev", successGPU)
successGPU = gpu_host_unregister(int(loc(q_mat),kind=c_intptr_t))
check_host_unregister_cuda("trans_ev: q_mat", successGPU)
check_host_unregister_gpu("trans_ev: q_mat", successGPU)
successGPU = gpu_free_host(hvm1_host)
check_host_dealloc_cuda("trans_ev: hvm1_host", successGPU)
check_host_dealloc_gpu("trans_ev: hvm1_host", successGPU)
nullify(hvm1)
successGPU = gpu_free_host(tmat_host)
check_host_dealloc_cuda("trans_ev: tmat_host", successGPU)
check_host_dealloc_gpu("trans_ev: tmat_host", successGPU)
nullify(tmat)
successGPU = gpu_free_host(tmp1_host)
check_host_dealloc_cuda("trans_ev: tmp1_host", successGPU)
check_host_dealloc_gpu("trans_ev: tmp1_host", successGPU)
nullify(tmp1)
successGPU = gpu_free_host(tmp2_host)
check_host_dealloc_cuda("trans_ev: tmp2_host", successGPU)
check_host_dealloc_gpu("trans_ev: tmp2_host", successGPU)
nullify(tmp2)
!deallocate(hvm1, stat=istat, errmsg=errorMessage)
......@@ -531,16 +531,16 @@ subroutine trans_ev_&
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
successGPU = gpu_free(q_dev)
check_dealloc_cuda("trans_ev", successGPU)
check_dealloc_gpu("trans_ev", successGPU)
successGPU = gpu_free(tmp_dev)
check_dealloc_cuda("trans_ev", successGPU)
check_dealloc_gpu("trans_ev", successGPU)
successGPU = gpu_free(hvm_dev)
check_dealloc_cuda("trans_ev", successGPU)
check_dealloc_gpu("trans_ev", successGPU)
successGPU = gpu_free(tmat_dev)
check_dealloc_cuda("trans_ev", successGPU)
check_dealloc_gpu("trans_ev", successGPU)
else
deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
check_deallocate("trans_ev_&
......
......@@ -291,33 +291,33 @@ subroutine tridiag_&
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
successGPU = gpu_malloc_host(v_row_host, num)
check_host_alloc_cuda("tridiag: v_row_host", successGPU)
check_host_alloc_gpu("tridiag: v_row_host", successGPU)
call c_f_pointer(v_row_host,v_row,(/(max_local_rows+1)/))
num = (max_local_cols) * size_of_datatype
successGPU = gpu_malloc_host(v_col_host,num)
check_host_alloc_cuda("tridiag: v_col_host", successGPU)
check_host_alloc_gpu("tridiag: v_col_host", successGPU)
call c_f_pointer(v_col_host,v_col,(/(max_local_cols)/))
num = (max_local_cols) * size_of_datatype
successGPU = gpu_malloc_host(u_col_host,num)
check_host_alloc_cuda("tridiag: u_col_host", successGPU)
check_host_alloc_gpu("tridiag: u_col_host", successGPU)
call c_f_pointer(u_col_host,u_col,(/(max_local_cols)/))
num = (max_local_rows) * size_of_datatype
successGPU = gpu_malloc_host(u_row_host,num)
check_host_alloc_cuda("tridiag: u_row_host", successGPU)
check_host_alloc_gpu("tridiag: u_row_host", successGPU)
call c_f_pointer(u_row_host,u_row,(/(max_local_rows)/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successGPU = gpu_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: vu_stored_roes", successGPU)
check_host_register_gpu("tridiag: vu_stored_roes", successGPU)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
successGPU = gpu_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successGPU)
check_host_register_gpu("tridiag: uv_stored_cols", successGPU)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num = na * 8
......@@ -326,7 +326,7 @@ subroutine tridiag_&
#endif
successGPU = gpu_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: e_vec", successGPU)
check_host_register_gpu("tridiag: e_vec", successGPU)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num = na * 8
......@@ -335,7 +335,7 @@ subroutine tridiag_&
#endif
successGPU = gpu_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: d_vec", successGPU)
check_host_register_gpu("tridiag: d_vec", successGPU)
else
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
......@@ -374,23 +374,23 @@ subroutine tridiag_&
if (useGPU) then
successGPU = gpu_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successGPU)
check_alloc_gpu("tridiag: v_row_dev", successGPU)
successGPU = gpu_malloc(u_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: u_row_dev", successGPU)
check_alloc_gpu("tridiag: u_row_dev", successGPU)
successGPU = gpu_malloc(v_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: v_col_dev", successGPU)
check_alloc_gpu("tridiag: v_col_dev", successGPU)
successGPU = gpu_malloc(u_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: u_col_dev", successGPU)
check_alloc_gpu("tridiag: u_col_dev", successGPU)
successGPU = gpu_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successGPU)
check_alloc_gpu("tridiag: vu_stored_rows_dev", successGPU)
successGPU = gpu_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successGPU)
check_alloc_gpu("tridiag: vu_stored_rows_dev", successGPU)
endif !useGPU
......@@ -417,15 +417,15 @@ subroutine tridiag_&
num = matrixRows * matrixCols * size_of_datatype
successGPU = gpu_malloc(a_dev, num)
check_alloc_cuda("tridiag: a_dev", successGPU)
check_alloc_gpu("tridiag: a_dev", successGPU)
successGPU = gpu_host_register(int(loc(a_mat),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: a_mat", successGPU)
check_host_register_gpu("tridiag: a_mat", successGPU)
successGPU = gpu_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successGPU)
check_memcpy_gpu("tridiag: a_dev", successGPU)
endif
! main cycle of tridiagonalization
......@@ -453,7 +453,7 @@ subroutine tridiag_&
successGPU = gpu_memcpy(int(loc(v_row),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successGPU)
check_memcpy_gpu("tridiag a_dev 1", successGPU)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
endif
......@@ -565,19 +565,19 @@ subroutine tridiag_&
if (l_rows > 0 .and. l_cols> 0 ) then
if (useGPU) then
successGPU = gpu_memset(u_col_dev, 0, l_cols * size_of_datatype)
check_memcpy_cuda("tridiag: u_col_dev", successGPU)
check_memcpy_gpu("tridiag: u_col_dev", successGPU)
successGPU = gpu_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successGPU)
check_memcpy_gpu("tridiag: u_row_dev", successGPU)
successGPU = gpu_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successGPU)
check_memcpy_gpu("tridiag: v_col_dev", successGPU)
successGPU = gpu_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
l_rows * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successGPU)
check_memcpy_gpu("tridiag: v_row_dev", successGPU)
endif ! useGU
#ifdef WITH_OPENMP_TRADITIONAL
......@@ -743,11 +743,11 @@ subroutine tridiag_&
successGPU = gpu_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successGPU)
check_memcpy_gpu("tridiag: u_col_dev 1", successGPU)
successGPU = gpu_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successGPU)
check_memcpy_gpu("tridiag: u_row_dev 1", successGPU)
endif ! useGPU
#ifdef WITH_OPENMP_TRADITIONAL
......@@ -876,12 +876,12 @@ subroutine tridiag_&
successGPU = gpu_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_rows_dev", successGPU)
check_memcpy_gpu("tridiag: uv_stored_rows_dev", successGPU)
successGPU = gpu_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successGPU)
check_memcpy_gpu("tridiag: uv_stored_cols_dev", successGPU)
endif
do i = 0, (istep-2)/tile_size
......@@ -946,7 +946,7 @@ subroutine tridiag_&
successGPU = gpu_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
1 * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successGPU)
check_memcpy_gpu("tridiag: a_dev 3", successGPU)
endif
if (n_stored_vecs > 0) then
......@@ -967,11 +967,11 @@ subroutine tridiag_&
if (useGPU) then
!a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
!successGPU = cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", successGPU)
!check_memcpy_gpu("tridiag: a_dev 4a5a", successGPU)
successGPU = gpu_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
int(1 * size_of_datatype, kind=c_intptr_t), gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev 4", successGPU)
check_memcpy_gpu("tridiag: a_dev 4", successGPU)
endif
endif
......@@ -986,7 +986,7 @@ subroutine tridiag_&
if (useGPU) then
successGPU = gpu_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successGPU)
check_memcpy_gpu("tridiag: a_dev 5", successGPU)
vrl = aux3(1)
else !useGPU
vrl = a_mat(1,l_cols)
......@@ -1022,7 +1022,7 @@ subroutine tridiag_&
if (useGPU) then
successGPU = gpu_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
1 * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successGPU)
check_memcpy_gpu("tridiag: a_dev 6", successGPU)
d_vec(1) = PRECISION_REAL(aux3(1))
else !useGPU
d_vec(1) = PRECISION_REAL(a_mat(1,1))
......@@ -1038,7 +1038,7 @@ subroutine tridiag_&
if (useGPU) then
successGPU = gpu_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successGPU)
check_memcpy_gpu("tridiag: a_dev 7", successGPU)
else !useGPU
e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
endif !useGPU
......@@ -1048,7 +1048,7 @@ subroutine tridiag_&
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successGPU = gpu_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successGPU)
check_memcpy_gpu("tridiag: a_dev 8", successGPU)
else !useGPU
if (isSkewsymmetric) then
d_vec(1) = 0.0_rk
......@@ -1065,25 +1065,25 @@ subroutine tridiag_&
if (useGPU) then
! todo: should we leave a_mat on the device for further use?
successGPU = gpu_free(a_dev)
check_dealloc_cuda("tridiag: a_dev 9", successGPU)
check_dealloc_gpu("tridiag: a_dev 9", successGPU)
successGPU = gpu_free(v_row_dev)
check_dealloc_cuda("tridiag: v_row_dev", successGPU)
check_dealloc_gpu("tridiag: v_row_dev", successGPU)
successGPU = gpu_free(u_row_dev)
check_dealloc_cuda("tridiag: (u_row_dev", successGPU)
check_dealloc_gpu("tridiag: (u_row_dev", successGPU)
successGPU = gpu_free(v_col_dev)
check_dealloc_cuda("tridiag: v_col_dev", successGPU)
check_dealloc_gpu("tridiag: v_col_dev", successGPU)
successGPU = gpu_free(u_col_dev)
check_dealloc_cuda("tridiag: u_col_dev ", successGPU)
check_dealloc_gpu("tridiag: u_col_dev ", successGPU)
successGPU = gpu_free(vu_stored_rows_dev)
check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successGPU)
check_dealloc_gpu("tridiag: vu_stored_rows_dev ", successGPU)
successGPU = gpu_free(uv_stored_cols_dev)
check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successGPU)
check_dealloc_gpu("tridiag:uv_stored_cols_dev ", successGPU)
endif
! distribute the arrays d_vec and e_vec to all processors
......@@ -1113,35 +1113,35 @@ subroutine tridiag_&
if (useGPU) then
successGPU = gpu_host_unregister(int(loc(a_mat),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: a_mat", successGPU)
check_host_unregister_gpu("tridiag: a_mat", successGPU)
successGPU = gpu_free_host(v_row_host)
check_host_dealloc_cuda("tridiag: v_row_host", successGPU)
check_host_dealloc_gpu("tridiag: v_row_host", successGPU)
nullify(v_row)
successGPU = gpu_free_host(v_col_host)
check_host_dealloc_cuda("tridiag: v_col_host", successGPU)
check_host_dealloc_gpu("tridiag: v_col_host", successGPU)
nullify(v_col)
successGPU = gpu_free_host(u_col_host)
check_host_dealloc_cuda("tridiag: u_col_host", successGPU)
check_host_dealloc_gpu("tridiag: u_col_host", successGPU)
nullify(u_col)
successGPU = gpu_free_host(u_row_host)
check_host_dealloc_cuda("tridiag: u_row_host", successGPU)
check_host_dealloc_gpu("tridiag: u_row_host", successGPU)
nullify(u_row)
successGPU = gpu_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: uv_stored_cols", successGPU)
check_host_unregister_gpu("tridiag: uv_stored_cols", successGPU)
successGPU = gpu_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: vu_stored_rows", successGPU)
check_host_unregister_gpu("tridiag: vu_stored_rows", successGPU)
successGPU = gpu_host_unregister(int(loc(e_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: e_vec", successGPU)
check_host_unregister_gpu("tridiag: e_vec", successGPU)
successGPU = gpu_host_unregister(int(loc(d_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: d_vec", successGPU)
check_host_unregister_gpu("tridiag: d_vec", successGPU)
else
deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
check_deallocate("tridiag: v_row, v_col, u_row, u_col", istat, errorMessage)
......
......@@ -202,34 +202,34 @@
! copy b to b_dev
num = ldb*ldbCols*size_of_datatype
successGPU = gpu_malloc(b_dev,num)
check_alloc_cuda("elpa_mult_at_b: b_dev", successGPU)
check_alloc_gpu("elpa_mult_at_b: b_dev", successGPU)
successGPU = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("elpa_mult_at_b: b", successGPU)
check_host_register_gpu("elpa_mult_at_b: b", successGPU)
successGPU = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
cudaMemcpyHostToDevice)
check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successGPU)
check_memcpy_gpu("elpa_mult_at_b: b to b_dev", successGPU)
num = l_rows*nblk_mult*size_of_datatype
successGPU = gpu_malloc_host(aux_host,num)
check_host_alloc_cuda("elpa_mult_at_b: aux_host", successGPU)
check_host_alloc_gpu("elpa_mult_at_b: aux_host", successGPU)
call c_f_pointer(aux_host,aux_mat,(/l_rows,nblk_mult/))
successGPU = gpu_malloc(aux_dev,num)
check_alloc_cuda("elpa_mult_at_b: aux_dev", successGPU)
check_alloc_gpu("elpa_mult_at_b: aux_dev", successGPU)
num = nblk_mult*l_cols*size_of_datatype
successGPU = gpu_malloc_host(tmp1_host,num)
check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successGPU)
check_host_alloc_gpu("elpa_mult_at_b: tmp1_host", successGPU)
call c_f_pointer(tmp1_host,tmp1,(/nblk_mult,l_cols/))
successGPU = gpu_malloc(tmp1_dev,num)
check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successGPU)
check_alloc_gpu("elpa_mult_at_b: tmp1_dev", successGPU)
else ! useGPU
allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
check_allocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
......@@ -357,7 +357,7 @@
num = l_rows*nblk_mult*size_of_datatype
successGPU = gpu_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successGPU)
check_memcpy_gpu("elpa_mult_at_b: aux_mat to aux_dev", successGPU)
aux_off = (lrs-1)*size_of_datatype
b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
......@@ -371,7 +371,7 @@
num = nstor*(lce-lcs+1)*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp1_dev, num, gpuMemcpyDeviceToHost)
check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
else ! useGPU
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
......@@ -414,25 +414,25 @@
if (useGPU) then
successGPU = gpu_free(b_dev)
check_dealloc_cuda("elpa_multiply_a_b: b_dev", successGPU)
check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)
successGPU = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
check_host_unregister_cuda("elpa_multiply_a_b: b", successGPU)
check_host_unregister_gpu("elpa_multiply_a_b: b", successGPU)
nullify(aux_mat)
nullify(tmp1)
successGPU = gpu_free_host(aux_host)
check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successGPU)
check_host_dealloc_gpu("elpa_multiply_a_b: aux_host", successGPU)
successGPU = gpu_free(aux_dev)
check_dealloc_cuda("elpa_multiply_a_b: aux_dev", successGPU)
check_dealloc_gpu("elpa_multiply_a_b: aux_dev", successGPU)
successGPU = gpu_free_host(tmp1_host)
check_host_dealloc_cuda("elpa_multiply_a_b: tmp1_host", successGPU)
check_host_dealloc_gpu("elpa_multiply_a_b: tmp1_host", successGPU)
successGPU = gpu_free(tmp1_dev)
check_dealloc_cuda("elpa_multiply_a_b: tmp1_dev", successGPU)
check_dealloc_gpu("elpa_multiply_a_b: tmp1_dev", successGPU)
else ! useGPU
deallocate(aux_mat, stat=istat, errmsg=errorMessage)
check_deallocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
......
......@@ -290,15 +290,15 @@ max_threads)
! Here we convert the regular host array into a pinned host array
successGPU = gpu_malloc(a_dev, lda*na_cols* size_of_datatype)
check_alloc_cuda("bandred: a_dev", successGPU)
check_alloc_gpu("bandred: a_dev", successGPU)
successGPU = gpu_host_register(int(loc(vav),kind=c_intptr_t), &
nbw * nbw * size_of_datatype,&
gpuHostRegisterDefault)
check_host_register_cuda("bandred: vav", successGPU)
check_host_register_gpu("bandred: vav", successGPU)
successGPU = gpu_malloc(vav_dev, nbw*nbw* size_of_datatype)
check_alloc_cuda("bandred: vav_dev", successGPU)
check_alloc_gpu("bandred: vav_dev", successGPU)
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
......@@ -373,17 +373,17 @@ max_threads)
successGPU = gpu_host_register(int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, gpuHostRegisterDefault)
check_host_register_cuda("bandred: a_mat", successGPU)
check_host_register_gpu("bandred: a_mat", successGPU)
cur_l_rows = 0
cur_l_cols = 0
successGPU = gpu_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("bandred: a_dev", successGPU)
check_memcpy_gpu("bandred: a_dev", successGPU)
successGPU = gpu_malloc(tmat_dev, nbw*nbw*size_of_datatype)
check_alloc_cuda("bandred: tmat_dev", successGPU)
check_alloc_gpu("bandred: tmat_dev", successGPU)
istep = (na-1)/nbw
blk_end = (na-1)/nbw
......@@ -417,18 +417,18 @@ max_threads)
endif