Commit 42fd54c3 authored by Andreas Marek's avatar Andreas Marek

Some indentation

parent 11c33c77
......@@ -92,569 +92,571 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
!> \param useGPU If true, GPU version of the subroutine will be used
!> \param wantDebug if true more debug information
!>
subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a_mat, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
use matrix_plot
use elpa_omp
use elpa_blas_interfaces
implicit none
subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a_mat, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
use matrix_plot
use elpa_omp
use elpa_blas_interfaces
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(matrixRows,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(matrixRows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(matrixRows,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(matrixRows,matrixCols)
#endif
real(kind=rk), intent(out) :: d_vec(na)
real(kind=rk), intent(out) :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=MPI_KIND) :: mpierr
integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
max_local_cols
! updated after each istep (in the main cycle) to contain number of
! local columns and rows of the remaining part of the matrix
!integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: n_stored_vecs
integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
uv_stored_cols_dev
logical :: successCUDA
integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
integer(kind=c_intptr_t) :: a_offset
integer(kind=ik), intent(in) :: max_threads
real(kind=rk), intent(out) :: d_vec(na)
real(kind=rk), intent(out) :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=MPI_KIND) :: mpierr
integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
max_local_cols
! updated after each istep (in the main cycle) to contain number of
! local columns and rows of the remaining part of the matrix
!integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: n_stored_vecs
integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
uv_stored_cols_dev
logical :: successCUDA
integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
integer(kind=c_intptr_t) :: a_offset
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, n_threads, n_iter
integer(kind=ik) :: my_thread, n_threads, n_iter
#endif
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
complex(kind=rck) :: aux3(1)
complex(kind=rck) :: aux3(1)
#endif
integer(kind=c_intptr_t) :: num
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), pointer :: v_row(:), & ! used to store calculated Householder Vector
v_col(:) ! the same Vector, but transposed
MATH_DATATYPE(kind=rck), pointer :: u_col(:), u_row(:)
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv Vector pairs are stored, than the matrix A_i is explicitli updated
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
MATH_DATATYPE(kind=rck), pointer :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
integer(kind=c_intptr_t) :: num
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), pointer :: v_row(:), & ! used to store calculated Householder Vector
v_col(:) ! the same Vector, but transposed
MATH_DATATYPE(kind=rck), pointer :: u_col(:), u_row(:)
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv Vector pairs are stored, than the matrix A_i is explicitli updated
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
MATH_DATATYPE(kind=rck), pointer :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
type(c_ptr) :: v_row_host, v_col_host
type(c_ptr) :: u_row_host, u_col_host
type(c_ptr) :: vu_stored_rows_host, uv_stored_cols_host
real(kind=rk), allocatable :: tmp_real(:)
integer(kind=ik) :: min_tile_size, error
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call obj%get("is_skewsymmetric",skewsymmetric,istat)
if (istat .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric settings. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("tridiag_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX // &
gpuString )
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI, kind=c_int)
np_rows = int(np_rowsMPI, kind=c_int)
my_pcol = int(my_pcolMPI, kind=c_int)
np_cols = int(np_colsMPI, kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! seems that tile is a square submatrix, consisting by several blocks
! it is a smallest possible square submatrix, where blocks being distributed among
! processors are "aligned" in both rows and columns
! -----------------
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- ...
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- .
! : : : : : : .
! : : : : : : .
!
! this is a tile, where each number represents block, assigned to a processor with the shown number
! size of this small block is nblk
! Image is for situation with 6 processors, 3 processor rows and 2 columns
! tile_size is thus nblk * 6
!
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for min_tile_size. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
totalblocks = (na-1)/nblk + 1
max_loc_block_rows = (totalblocks-1)/np_rows + 1
max_loc_block_cols = (totalblocks-1)/np_cols + 1
! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor
max_local_rows = max_loc_block_rows*nblk
max_local_cols = max_loc_block_cols*nblk
! allocate memmory for vectors
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = Vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row 1 element longer to allow store and broadcast tau together with it
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
successCUDA = cuda_malloc_host(v_row_host,num)
check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
call c_f_pointer(v_row_host,v_row,(/(max_local_rows+1)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(v_col_host,num)
check_host_alloc_cuda("tridiag: v_col_host", successCUDA)
call c_f_pointer(v_col_host,v_col,(/(max_local_cols)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(u_col_host,num)
check_host_alloc_cuda("tridiag: u_col_host", successCUDA)
call c_f_pointer(u_col_host,u_col,(/(max_local_cols)/))
num = (max_local_rows) * size_of_datatype
successCUDA = cuda_malloc_host(u_row_host,num)
check_host_alloc_cuda("tridiag: u_row_host", successCUDA)
call c_f_pointer(u_row_host,u_row,(/(max_local_rows)/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: vu_stored_roes", successCUDA)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)
type(c_ptr) :: v_row_host, v_col_host
type(c_ptr) :: u_row_host, u_col_host
type(c_ptr) :: vu_stored_rows_host, uv_stored_cols_host
real(kind=rk), allocatable :: tmp_real(:)
integer(kind=ik) :: min_tile_size, error
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=ik) :: nblockEnd
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call obj%get("is_skewsymmetric",skewsymmetric,istat)
if (istat .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric settings. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("tridiag_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX // &
gpuString )
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
my_prow = int(my_prowMPI, kind=c_int)
np_rows = int(np_rowsMPI, kind=c_int)
my_pcol = int(my_pcolMPI, kind=c_int)
np_cols = int(np_colsMPI, kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! seems that tile is a square submatrix, consisting by several blocks
! it is a smallest possible square submatrix, where blocks being distributed among
! processors are "aligned" in both rows and columns
! -----------------
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- ...
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- .
! : : : : : : .
! : : : : : : .
!
! this is a tile, where each number represents block, assigned to a processor with the shown number
! size of this small block is nblk
! Image is for situation with 6 processors, 3 processor rows and 2 columns
! tile_size is thus nblk * 6
!
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for min_tile_size. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
nblockEnd = 3
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
totalblocks = (na-1)/nblk + 1
max_loc_block_rows = (totalblocks-1)/np_rows + 1
max_loc_block_cols = (totalblocks-1)/np_cols + 1
! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor
max_local_rows = max_loc_block_rows*nblk
max_local_cols = max_loc_block_cols*nblk
! allocate memmory for vectors
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = Vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row 1 element longer to allow store and broadcast tau together with it
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
successCUDA = cuda_malloc_host(v_row_host,num)
check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
call c_f_pointer(v_row_host,v_row,(/(max_local_rows+1)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(v_col_host,num)
check_host_alloc_cuda("tridiag: v_col_host", successCUDA)
call c_f_pointer(v_col_host,v_col,(/(max_local_cols)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(u_col_host,num)
check_host_alloc_cuda("tridiag: u_col_host", successCUDA)
call c_f_pointer(u_col_host,u_col,(/(max_local_cols)/))
num = (max_local_rows) * size_of_datatype
successCUDA = cuda_malloc_host(u_row_host,num)
check_host_alloc_cuda("tridiag: u_row_host", successCUDA)
call c_f_pointer(u_row_host,u_row,(/(max_local_rows)/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: vu_stored_roes", successCUDA)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num = na * 8
num = na * 8
#else
num = na * 4
num = na * 4
#endif
successCUDA = cuda_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
successCUDA = cuda_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: e_vec", successCUDA)
check_host_register_cuda("tridiag: e_vec", successCUDA)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num = na * 8
num = na * 8
#else
num = na * 4
num = na * 4
#endif
successCUDA = cuda_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
successCUDA = cuda_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: d_vec", successCUDA)
check_host_register_cuda("tridiag: d_vec", successCUDA)
else
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
endif
else
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
endif
#ifdef WITH_OPENMP
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "ur_p", istat, errorMessage)
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "ur_p", istat, errorMessage)
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uc_p", istat, errorMessage)
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */
tmp = 0
v_row = 0
u_row = 0
v_col = 0
u_col = 0
tmp = 0
v_row = 0
u_row = 0
v_col = 0
u_col = 0
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successCUDA)
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: u_row_dev", successCUDA)
check_alloc_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
endif !useGPU
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
endif !useGPU
d_vec(:) = 0
e_vec(:) = 0
tau(:) = 0
d_vec(:) = 0
e_vec(:) = 0
tau(:) = 0
n_stored_vecs = 0
n_stored_vecs = 0
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
d_vec(na) = a_mat(l_rows,l_cols)
d_vec(na) = a_mat(l_rows,l_cols)
#endif
if (useGPU) then
! allocate memmory for matrix A on the device and than copy the matrix
if (useGPU) then