Commit 3c6e3a1d authored by Andreas Marek

Some indentation

parent 42fd54c3
@@ -51,830 +51,832 @@
#include "../general/sanity.F90"
subroutine tridiag_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nb, nblk, a_mat, lda, d, e, matrixCols, &
 hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads)
  !-------------------------------------------------------------------------------
  ! tridiag_band_real/complex:
  ! Reduces a real symmetric band matrix to tridiagonal form
  !
  !  na                     Order of matrix a
  !
  !  nb                     Semi bandwidth
  !
  !  nblk                   Blocksize of cyclic distribution, must be the same in both directions!
  !
  !  a_mat(lda,matrixCols)  Distributed system matrix reduced to banded form in the upper diagonal
  !
  !  lda                    Leading dimension of a
  !  matrixCols             Local columns of matrix a
  !
  !  hh_trans               Householder vectors
  !
  !  d(na)                  Diagonal of tridiagonal matrix, set only on PE 0 (output)
  !
  !  e(na)                  Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
  !
  !  mpi_comm_rows
  !  mpi_comm_cols          MPI-Communicators for rows/columns
  !  communicator           MPI-Communicator for the total processor set
  !-------------------------------------------------------------------------------
  use elpa_abstract_impl
  use elpa2_workload
  use precision
  use iso_c_binding
  use redist
#ifdef WITH_OPENMP
  use omp_lib
#endif
  use elpa_blas_interfaces
  use elpa_skewsymmetric_blas
  implicit none
#include "../general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  logical, intent(in)                        :: useGPU, wantDebug
  integer(kind=c_int)                        :: skewsymmetric
  logical                                    :: isSkewsymmetric
  integer(kind=ik), intent(in)               :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
#ifdef USE_ASSUMED_SIZE
  MATH_DATATYPE(kind=rck), intent(in)        :: a_mat(lda,*)
#else
  MATH_DATATYPE(kind=rck), intent(in)        :: a_mat(lda,matrixCols)
#endif
  real(kind=rk), intent(out)                 :: d(na), e(na) ! set only on PE 0
  MATH_DATATYPE(kind=rck), intent(out), allocatable :: hh_trans(:,:)
  real(kind=rk)                              :: vnorm2
  MATH_DATATYPE(kind=rck)                    :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
  MATH_DATATYPE(kind=rck)                    :: hd(nb), hs(nb)
  integer(kind=ik)                           :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
  integer(kind=ik)                           :: my_pe, n_pes
  integer(kind=ik)                           :: my_prow, np_rows, my_pcol, np_cols
  integer(kind=MPI_KIND)                     :: my_peMPI, n_pesMPI, mpierr
  integer(kind=MPI_KIND)                     :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
  integer(kind=MPI_KIND)                     :: ireq_ab, ireq_hv
  integer(kind=ik)                           :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
  integer(kind=ik), intent(in)               :: nrThreads
#ifdef WITH_OPENMP
  integer(kind=ik)                           :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
#endif
  integer(kind=ik), allocatable              :: global_id_tmp(:,:)
  integer(kind=ik), allocatable              :: omp_block_limits(:)
  MATH_DATATYPE(kind=rck), allocatable       :: hv_t(:,:), tau_t(:)
#endif /* WITH_OPENMP */
  integer(kind=ik), allocatable              :: global_id(:,:), hh_cnt(:), hh_dst(:)
  integer(kind=MPI_KIND), allocatable        :: ireq_hhr(:), ireq_hhs(:)
  integer(kind=ik), allocatable              :: limits(:), snd_limits(:,:)
  integer(kind=ik), allocatable              :: block_limits(:)
  MATH_DATATYPE(kind=rck), allocatable       :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
  integer                                    :: istat
  integer(kind=ik)                           :: nblockEnd
  character(200)                             :: errorMessage
  character(20)                              :: gpuString
#ifndef WITH_MPI
  integer(kind=ik)                           :: startAddr
#endif
  call obj%get("is_skewsymmetric",skewsymmetric,istat)
  if (istat .ne. ELPA_OK) then
    print *,"Problem getting option for skewsymmetric settings. Aborting..."
    stop
  endif
  isSkewsymmetric = (skewsymmetric == 1)
  if (useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif
  call obj%timer%start("tridiag_band_&
  &MATH_DATATYPE&
  &" // &
  &PRECISION_SUFFIX //&
  gpuString)
  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(communicator,kind=MPI_KIND) ,my_peMPI ,mpierr)
  call mpi_comm_size(int(communicator,kind=MPI_KIND) ,n_pesMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND),my_prowMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND),np_rowsMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND),my_pcolMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND),np_colsMPI ,mpierr)
  my_pe   = int(my_peMPI,kind=MPI_KIND)
  n_pes   = int(n_pesMPI,kind=MPI_KIND)
  my_prow = int(my_prowMPI,kind=MPI_KIND)
  np_rows = int(np_rowsMPI,kind=MPI_KIND)
  my_pcol = int(my_pcolMPI,kind=MPI_KIND)
  np_cols = int(np_colsMPI,kind=MPI_KIND)
  if (wantDebug) call obj%timer%stop("mpi_communication")
  ! Get global_id mapping 2D processor coordinates to global id
  allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: global_id", istat, errorMessage)
  global_id(:,:) = 0
  global_id(my_prow, my_pcol) = my_pe
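  ! Each PE has set only its own entry in global_id; with MPI, the allreduce below
  ! sums all contributions so that every PE ends up with the complete coordinate-to-rank map.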
#ifdef WITH_OPENMP
  allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: global_id_tmp", istat, errorMessage)
#endif
#ifdef WITH_MPI
  if (wantDebug) call obj%timer%start("mpi_communication")
#ifndef WITH_OPENMP
  call mpi_allreduce(mpi_in_place, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, &
                     mpi_sum, int(communicator,kind=MPI_KIND), mpierr)
#else
  global_id_tmp(:,:) = global_id(:,:)
  call mpi_allreduce(global_id_tmp, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, &
                     mpi_sum, int(communicator,kind=MPI_KIND), mpierr)
  deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
  check_deallocate("tridiag_band: global_id_tmp", istat, errorMessage)
#endif /* WITH_OPENMP */
  if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
  ! Total number of blocks in the band:
  nblocks_total = (na-1)/nb + 1
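  ! (integer ceiling division: e.g. na=1000, nb=64 gives nblocks_total = 16)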
  ! Set work distribution
  allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: block_limits", istat, errorMessage)
  call divide_band(obj, nblocks_total, n_pes, block_limits)
  ! nblocks: the number of blocks for my task
  nblocks = block_limits(my_pe+1) - block_limits(my_pe)
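  ! (divide_band fills block_limits so that PE p handles band blocks
  !  block_limits(p) .. block_limits(p+1)-1)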
  ! allocate the part of the band matrix which is needed by this PE
  ! The size is 1 block larger than needed to avoid extensive shifts
  allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: ab", istat, errorMessage)
  ab = 0.0_rck ! needed for lower half, the extra block should also be set to 0 for safety
  ! n_off: Offset of ab within band
  n_off = block_limits(my_pe)*nb
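  ! Global column j of the band corresponds to local column j-n_off of ab on this PE
  ! (see e.g. the access ab(1:nb+1,na_s-n_off) further below).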
  ! Redistribute band in a to ab
  call redist_band_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &(obj, a_mat, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
  ! Calculate the workload for each sweep in the back transformation
  ! and the space requirements to hold the HH vectors
  allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: limits", istat, errorMessage)
  call determine_workload(obj, na, nb, np_rows, limits)
  max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
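  ! max_blk_size: maximum number of rows assigned to any row-PE for the full matrix;
  ! used below to dimension the hh_gath/hh_send buffers.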
  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  do n = 1, nblocks_total
    call determine_workload(obj, nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    ! add to number of Householder vectors
    ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_hh_vecs = num_hh_vecs + local_size
      num_chunks  = num_chunks+1
    endif
    nx = nx - nb
  enddo
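  ! The loop above only counted the HH vectors and chunks this PE will receive,
  ! so the buffers can be allocated; the second pass below posts the actual receives.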
  ! Allocate space for HH vectors
  allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_trans", istat, errorMessage)
  ! Allocate and init MPI requests
  allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
  check_allocate("tridiag_band: ireq_hhr", istat, errorMessage)
  allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
  check_allocate("tridiag_band: ireq_hhs", istat, errorMessage)
  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  nt = 0
  nBlockEnd = 1
  do n = 1, nblocks_total
    call determine_workload(obj, nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_chunks = num_chunks+1
#ifdef WITH_MPI
      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_irecv(hh_trans(1,num_hh_vecs+1), int(nb*local_size,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                     int(nt,kind=MPI_KIND), int(10+n-block_limits(nt),kind=MPI_KIND), &
                     int(communicator,kind=MPI_KIND), ireq_hhr(num_chunks), mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! careful: for a non-blocking recv the data copy must be done at wait or send
      ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
      num_hh_vecs = num_hh_vecs + local_size
    endif
    nx = nx - nb
    if (n == block_limits(nt+1)) then
      nt = nt + 1
    endif
  enddo
#ifdef WITH_MPI
  ireq_hhs(:) = MPI_REQUEST_NULL
#endif
  ! Buffers for gathering/sending the HH vectors
  allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
  check_allocate("tridiag_band: hh_gath", istat, errorMessage)
  allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
  check_allocate("tridiag_band: hh_send", istat, errorMessage)
  hh_gath(:,:,:) = 0.0_rck
  hh_send(:,:,:) = 0.0_rck
  ! Some counters
  allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_cnt", istat, errorMessage)
  allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hh_dst", istat, errorMessage)
  hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
  hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
  ireq_ab = MPI_REQUEST_NULL
  ireq_hv = MPI_REQUEST_NULL
#endif
  ! Limits for sending
  allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: snd_limits", istat, errorMessage)
  do iblk=1,nblocks
    call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
  enddo
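  ! snd_limits(:,iblk) holds the row distribution of the remaining matrix at local
  ! block iblk; presumably this decides which rows of the gathered HH vectors are
  ! sent to which row-PE.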
#ifdef WITH_OPENMP
  ! OpenMP work distribution:
  max_threads = nrThreads
  ! For OpenMP we need at least 2 blocks for every thread
  max_threads = MIN(max_threads, nblocks/2)
  if (max_threads==0) max_threads = 1
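  ! (integer division: e.g. nrThreads=4 and nblocks=5 give max_threads = min(4, 5/2) = 2)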
  allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: omp_block_limits", istat, errorMessage)
  ! Get the OpenMP block limits
  call divide_band(obj, nblocks, max_threads, omp_block_limits)
  allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
  check_allocate("tridiag_band: hv_t", istat, errorMessage)
  hv_t  = 0.0_rck
  tau_t = 0.0_rck
#endif /* WITH_OPENMP */
  ! ---------------------------------------------------------------------------
  ! Start of calculations
  na_s = block_limits(my_pe)*nb + 1
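  ! na_s: first global column of this PE's part of the band
  ! (block_limits(my_pe) blocks of width nb precede it)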
  if (my_pe>0 .and. na_s<=na) then
    ! send first column to previous PE
    ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
    ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
    if (wantDebug) call obj%timer%start("mpi_communication")
    call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                   int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr)
    if (wantDebug) call obj%timer%stop("mpi_communication")