Commit 63e9e612 authored by Andreas Marek's avatar Andreas Marek

Some indentation

parent 1daaa172
......@@ -60,7 +60,7 @@
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
subroutine bandred_&
subroutine bandred_&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -100,1090 +100,1092 @@
!
!-------------------------------------------------------------------------------
use cuda_functions
use iso_c_binding
use elpa1_compute
use cuda_functions
use iso_c_binding
use elpa1_compute
#ifdef WITH_OPENMP
use omp_lib
use omp_lib
#endif
use precision
use elpa_blas_interfaces
use precision
use elpa_blas_interfaces
#ifdef WITH_MPI
use elpa_scalapack_interfaces
use elpa_scalapack_interfaces
#endif
use elpa_abstract_impl
use elpa_abstract_impl
implicit none
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
real(kind=rk) :: eps
real(kind=rk) :: eps
#endif
logical, intent(in) :: useGPU
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik) :: l_cols, l_rows
logical, intent(in) :: useGPU
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik) :: l_cols, l_rows
#if REALCASE == 1
integer(kind=ik) :: vmrCols
integer(kind=ik) :: vmrCols
#endif
#ifdef WITH_OPENMP
integer(kind=ik) :: mynlc, lrs, transformChunkSize
integer(kind=ik) :: mynlc, lrs, transformChunkSize
#endif
integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), pointer :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), pointer :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
#if REALCASE == 1
! needed for blocked QR decomposition
integer(kind=ik) :: PQRPARAM(11), work_size
real(kind=rk) :: dwork_size(1)
real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
! needed for blocked QR decomposition
integer(kind=ik) :: PQRPARAM(11), work_size
real(kind=rk) :: dwork_size(1)
real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
type(c_ptr) :: vmr_host, umc_host
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
type(c_ptr) :: vmr_host, umc_host
#ifdef WITH_MPI
!integer(kind=ik), external :: numroc -> use elpa_scalapack
!integer(kind=ik), external :: numroc -> use elpa_scalapack
#endif
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=ik) :: l_rows2, vmr_size2, umc_size2
integer(kind=c_intptr_t) :: lc_start, lc_end
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=ik) :: l_rows2, vmr_size2, umc_size2
integer(kind=c_intptr_t) :: lc_start, lc_end
#if COMPLEXCASE == 1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
#endif
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols
integer(kind=BLAS_KIND) :: na_colsBLAS
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols
integer(kind=BLAS_KIND) :: na_colsBLAS
#if COMPLEXCASE == 1
integer(kind=ik) :: na_rows
integer(kind=BLAS_KIND) :: na_rowsBLAS
integer(kind=ik) :: na_rows
integer(kind=BLAS_KIND) :: na_rowsBLAS
#endif
logical, intent(in) :: wantDebug
logical, intent(out) :: success
logical :: successCUDA
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: min_tile_size, error
logical, intent(in) :: wantDebug
logical, intent(out) :: success
logical :: successCUDA
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: min_tile_size, error
#if REALCASE == 1
logical, intent(in) :: useQR
logical, intent(in) :: useQR
#endif
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
integer(kind=ik), intent(in) :: max_threads
logical :: do_memcpy
integer(kind=ik) :: i_blk,blk_off
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric settings. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX // &
gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
integer(kind=ik), intent(in) :: max_threads
logical :: do_memcpy
integer(kind=ik) :: i_blk,blk_off, blk_end
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric settings. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX // &
gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
#endif
endif
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
success = .true.
! Semibandwith nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
success = .true.
! Semibandwith nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! na_rows in used nowhere; only na_cols
if (useGPU) then
! na_rows in used nowhere; only na_cols
if (useGPU) then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
int(my_prow,kind=BLAS_KIND), 0_BLAS_KIND, int(np_rows,kind=BLAS_KIND))
na_rows = int(na_rowsBLAS,kind=c_int)
na_rows = int(na_rowsBLAS,kind=c_int)
#endif
na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
int(my_pcol,kind=BLAS_KIND), 0_BLAS_KIND, int(np_cols,kind=BLAS_KIND))
na_cols = int(na_colsBLAS,kind=c_int)
na_cols = int(na_colsBLAS,kind=c_int)
#else
#if COMPLEXCASE == 1
na_rows = na
na_rows = na
#endif
na_cols = na
na_cols = na
#endif /* WITH_MPI */
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
check_alloc_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), &
nbw * nbw * size_of_datatype,&
cudaHostRegisterDefault)
check_host_register_cuda("bandred: vav", successCUDA)
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
check_alloc_cuda("bandred: vav_dev", successCUDA)
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for min_tile_size. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
check_alloc_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), &
nbw * nbw * size_of_datatype,&
cudaHostRegisterDefault)
check_host_register_cuda("bandred: vav", successCUDA)
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
check_alloc_cuda("bandred: vav_dev", successCUDA)
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for min_tile_size. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
#if REALCASE == 1
if (useQR) then
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: tauvector", istat, errorMessage)
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: tauvector", istat, errorMessage)
allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
check_allocate("bandred: blockheuristic", istat, errorMessage)
allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
check_allocate("bandred: blockheuristic", istat, errorMessage)
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: vmrCPU", istat, errorMessage)
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: vmrCPU", istat, errorMessage)
vmrCols = na
vmrCols = na
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif
work_size = int(dwork_size(1))
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
check_allocate("bandred: work_blocked", istat, errorMessage)
work_blocked = 0.0_rk
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vmrCPU", istat, errorMessage)
work_size = int(dwork_size(1))
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
check_allocate("bandred: work_blocked", istat, errorMessage)
work_blocked = 0.0_rk
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vmrCPU", istat, errorMessage)
endif ! which_qr_decomposition
endif ! which_qr_decomposition
endif ! useQr
endif ! useQr
#endif /* REALCASE */
if (useGPU) then
successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaHostRegisterDefault)
check_host_register_cuda("bandred: a_mat", successCUDA)
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype)
check_alloc_cuda("bandred: tmat_dev", successCUDA)
istep = (na-1)/nbw
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows,1)
cur_l_cols = max(l_cols,1)
vmr_size = cur_l_rows*2*n_cols
umc_size = cur_l_cols*2*n_cols
istep = (na-1)/nbw - 1
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows2,1)
cur_l_cols = max(l_cols,1)
vmr_size2 = cur_l_rows*2*n_cols
umc_size2 = cur_l_cols*2*n_cols
l_rows = max(l_rows,l_rows2)
vmr_size = max(vmr_size,vmr_size2)
umc_size = max(umc_size,umc_size2)
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vr "//errorMessage
stop 1
endif
blk_end = (na-1)/nbw
if (useGPU) then
successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaHostRegisterDefault)
check_host_register_cuda("bandred: a_mat", successCUDA)
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype)
check_alloc_cuda("bandred: tmat_dev", successCUDA)
istep = (na-1)/nbw
blk_end = (na-1)/nbw
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows,1)
cur_l_cols = max(l_cols,1)
vmr_size = cur_l_rows*2*n_cols
umc_size = cur_l_cols*2*n_cols
istep = (na-1)/nbw - 1
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows2,1)
cur_l_cols = max(l_cols,1)
vmr_size2 = cur_l_rows*2*n_cols
umc_size2 = cur_l_cols*2*n_cols
l_rows = max(l_rows,l_rows2)
vmr_size = max(vmr_size,vmr_size2)
umc_size = max(umc_size,umc_size2)
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vr "//errorMessage
stop 1
endif
successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype)
check_host_alloc_cuda("bandred: vmr_host", successCUDA)
call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/))
successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype)
check_host_alloc_cuda("bandred: vmr_host", successCUDA)
call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/))