Commit ab19611c authored by Andreas Marek

Changes to OpenMP: determine the thread count once at the public entry points and pass it down the call chain as an explicit argument (max_threads / nrThreads) instead of querying OpenMP inside each routine (see the sketch below).

parent 04c4c95f
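The recurring pattern in this commit, shown below as a minimal self-contained sketch: the thread count is determined once at the public entry point and handed down as an explicit argument (max_threads / nrThreads), so worker routines can size per-thread workspaces without querying OpenMP themselves. The names thread_plumbing_sketch and merge_worker are illustrative only, not ELPA routines; this is an assumption-level illustration of the pattern, not the actual ELPA code.

! Sketch only; builds with e.g. "gfortran -cpp -fopenmp sketch.F90".
program thread_plumbing_sketch
#ifdef WITH_OPENMP
  use omp_lib
#endif
  implicit none
  integer :: max_threads
#ifdef WITH_OPENMP
  ! Query once at entry: omp_get_max_threads(), not omp_get_num_threads(),
  ! which returns 1 outside a parallel region.
  max_threads = omp_get_max_threads()
#else
  max_threads = 1
#endif
  call merge_worker(10, max_threads)
contains
  subroutine merge_worker(na, max_threads)
    integer, intent(in) :: na, max_threads  ! thread count arrives as an argument
    integer :: my_thread
    real, allocatable :: z_p(:,:)
    ! Per-thread workspace sized from the caller-supplied count
    allocate(z_p(na, 0:max_threads-1))
    z_p = 0.0
#ifdef WITH_OPENMP
    !$omp parallel private(my_thread)
    my_thread = omp_get_thread_num()
    z_p(:,my_thread) = z_p(:,my_thread) + 1.0
    !$omp end parallel
#endif
    deallocate(z_p)
  end subroutine
end program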
......@@ -57,16 +57,19 @@
subroutine merge_systems_&
&PRECISION &
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success)
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, npc_0, npc_n
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, npc_0, npc_n
integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=REAL_DATATYPE), intent(inout) :: d(na), e
#ifdef USE_ASSUMED_SIZE
......@@ -74,49 +77,46 @@
#else
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
! TODO: experiment with max_strip; if it were larger, the matrices being
! multiplied might be larger as well.
integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
qtrans(2,2), dmax, zmax, d1new, d2new
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
qtrans(2,2), dmax, zmax, d1new, d2new
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
#endif
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_real
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_real
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads, omp_get_thread_num
max_threads = omp_get_max_threads()
integer(kind=ik) :: my_thread
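! Per-thread copy of z: one column of z_p per OpenMP thread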
allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -438,7 +438,7 @@
#ifdef WITH_OPENMP
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
! Solve the secular equation for each eigenvalue in parallel; each thread
! accumulates its contribution into its own column z_p(:,my_thread)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
my_thread = omp_get_thread_num()
!$OMP DO
......
......@@ -57,31 +57,32 @@
subroutine solve_tridi_&
&PRECISION_AND_SUFFIX &
( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, useGPU, wantDebug, success )
mpi_comm_cols, useGPU, wantDebug, success, max_threads )
use precision
use elpa_abstract_impl
implicit none
#include "../../src/general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=REAL_DATATYPE), intent(inout) :: d(na), e(na)
integer(kind=ik), intent(in) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=REAL_DATATYPE), intent(inout) :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=ik), intent(in) :: max_threads
if(useGPU) then
gpuString = "_gpu"
......@@ -152,7 +153,7 @@ subroutine solve_tridi_&
call solve_tridi_col_&
&PRECISION_AND_SUFFIX &
(obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, useGPU, wantDebug, success)
matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads)
if (.not.(success)) then
call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
......@@ -335,7 +336,7 @@ subroutine solve_tridi_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success )
l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success, max_threads )
if (.not.(success)) return
else
! Not last merge, leave dense column distribution
......@@ -343,7 +344,7 @@ subroutine solve_tridi_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success )
l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success, max_threads )
if (.not.(success)) return
endif
end subroutine merge_recursive_&
......@@ -354,7 +355,7 @@ subroutine solve_tridi_&
subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success )
( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
......@@ -385,6 +386,8 @@ subroutine solve_tridi_&
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik), intent(in) :: max_threads
call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -557,7 +560,7 @@ subroutine solve_tridi_&
&PRECISION &
(obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success)
l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success, max_threads)
if (.not.(success)) return
enddo
......
......@@ -110,6 +110,7 @@ function elpa_solve_evp_&
mpi_comm_all, check_pd, i, error
logical :: do_bandred, do_solve, do_trans_ev
integer(kind=ik) :: nrThreads, omp_get_max_threads
call obj%timer%start("elpa_solve_evp_&
&MATH_DATATYPE&
......@@ -117,6 +118,13 @@ function elpa_solve_evp_&
&PRECISION&
&")
#ifdef WITH_OPENMP
! Query the thread count once at entry; omp_get_num_threads() would always return 1 in this serial region
nrThreads = omp_get_max_threads()
#else
nrThreads = 1
#endif
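! Editorial note on the query above (an explanatory aside, not part of the
! original commit): in a serial region omp_get_num_threads() always returns 1,
! while omp_get_max_threads() returns the team size the next parallel region
! will get, which is what per-thread buffer sizing at an entry point needs.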
success = .true.
if (present(q)) then
......@@ -309,7 +317,7 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug)
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
call obj%timer%stop("forward")
endif !do_bandred
......@@ -324,7 +332,7 @@ function elpa_solve_evp_&
#if COMPLEXCASE == 1
q_real, l_rows, &
#endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success)
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
call obj%timer%stop("solve")
if (.not.(success)) return
endif !do_solve
......
......@@ -96,12 +96,15 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug)
(obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
use matrix_plot
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
......@@ -150,11 +153,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
integer(kind=c_intptr_t) :: a_offset
integer(kind=c_intptr_t) :: a_offset
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter
integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
integer(kind=ik) :: my_thread, n_threads, n_iter
#endif
real(kind=REAL_DATATYPE) :: vnorm2
......@@ -305,8 +308,6 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&MATH_DATATYPE ", "u_col", istat, errorMessage)
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "ur_p", istat, errorMessage)
......@@ -502,7 +503,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&_&
&PRECISION &
(obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
1, istep-1, 1, nblk, max_threads)
! Calculate u = (A + VU**T + UV**T)*v
......@@ -532,7 +533,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
! Consistency check: the team size should match the caller-supplied max_threads
if (n_threads .ne. max_threads) then
  print *,"tridiag: OpenMP team size ",n_threads," differs from max_threads ",max_threads
endif
n_iter = 0
......@@ -710,7 +717,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&_&
&PRECISION &
(obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
mpi_comm_cols, istep-1, 1, nblk)
mpi_comm_cols, istep-1, 1, nblk, max_threads)
endif
......@@ -733,7 +740,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&_&
&PRECISION &
(obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk)
mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
......
......@@ -48,6 +48,9 @@
use elpa_mpi
use precision
use elpa_abstract_impl
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
......@@ -68,6 +71,7 @@
logical :: success
integer(kind=ik) :: istat, debug, error
character(200) :: errorMessage
integer(kind=ik) :: max_threads
call obj%timer%start("elpa_cholesky_&
&MATH_DATATYPE&
......@@ -75,6 +79,12 @@
&PRECISION&
&")
#ifdef WITH_OPENMP
! omp_get_max_threads(), since omp_get_num_threads() returns 1 outside a parallel region
max_threads=omp_get_max_threads()
#else
max_threads=1
#endif
na = obj%na
lda = obj%local_nrows
nblk = obj%nblk
......@@ -285,7 +295,7 @@
&PRECISION &
(obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
n, na, nblk, nblk)
n, na, nblk, nblk, max_threads)
do i=0,(na-1)/tile_size
lcs = max(l_colx,i*l_cols_tile+1)
......
......@@ -50,7 +50,7 @@ subroutine elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvr, nvc, nblk)
(obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvr, nvc, nblk, nrThreads)
!-------------------------------------------------------------------------------
! This routine performs a reduction (add) of all vectors in vmat_s over the communicator comm_t.
......@@ -81,7 +81,7 @@ subroutine elpa_reduce_add_vectors_&
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: vmat_s(ld_s,nvc)
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: vmat_t(ld_t,nvc)
......@@ -91,6 +91,8 @@ subroutine elpa_reduce_add_vectors_&
integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
integer(kind=ik) :: lcm_s_t, nblks_tot
integer(kind=ik) :: auxstride
integer(kind=ik), intent(in) :: nrThreads
call obj%timer%start("elpa_reduce_add_vectors_&
&MATH_DATATYPE&
......@@ -119,6 +121,8 @@ subroutine elpa_reduce_add_vectors_&
aux1(:) = 0
aux2(:) = 0
#ifdef WITH_OPENMP
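! Run the parallel region below with the caller-supplied thread count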
call omp_set_num_threads(nrThreads)
!$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl)
#endif
do n = 0, lcm_s_t-1
......
......@@ -63,6 +63,9 @@
&_impl
use precision
use elpa_abstract_impl
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
......@@ -78,6 +81,7 @@
logical :: success
integer :: debug, error
integer :: max_threads
call obj%timer%start("elpa_solve_tridi_public_&
&MATH_DATATYPE&
......@@ -90,6 +94,12 @@
ldq = obj%local_nrows
matrixCols = obj%local_ncols
#ifdef WITH_OPENMP
! omp_get_max_threads(), since omp_get_num_threads() returns 1 outside a parallel region
max_threads=omp_get_max_threads()
#else
max_threads=1
#endif
call obj%get("mpi_comm_rows", mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
......@@ -116,7 +126,8 @@
call solve_tridi_&
&PRECISION&
&_private_impl(obj, na, nev, d, e, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols,.false., wantDebug, success)
mpi_comm_rows, mpi_comm_cols,.false., wantDebug, success, &
max_threads)
call obj%timer%stop("elpa_solve_tridi_public_&
&MATH_DATATYPE&
......
......@@ -54,7 +54,7 @@ subroutine elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvs, nvr, nvc, nblk)
(obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvs, nvr, nvc, nblk, nrThreads)
!-------------------------------------------------------------------------------
! This routine transposes an array of vectors which are distributed in
......@@ -94,6 +94,7 @@ subroutine elpa_transpose_vectors_&
integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip
integer(kind=ik) :: auxstride
integer(kind=ik), intent(in) :: nrThreads
call obj%timer%start("elpa_transpose_vectors_&
&MATH_DATATYPE&
......
......@@ -51,9 +51,9 @@
&_&
#endif
&PRECISION &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
#ifdef WITH_OPENMP
max_threads, l_nev, &
l_nev, &
#endif
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
......@@ -123,6 +123,7 @@
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
integer(kind=ik), intent(in) :: max_threads
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
#if REALCASE == 1
......@@ -135,7 +136,7 @@
#endif
#else /* WITH_OPENMP */
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
integer(kind=ik), intent(in) :: l_nev, thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
......
......@@ -65,13 +65,11 @@
&_&
&PRECISION &
(obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, useGPU, success &
tmat_dev, wantDebug, useGPU, success, &
#if REALCASE == 1
, useQR)
#endif
#if COMPLEXCASE == 1
)
useQR, &
#endif
max_threads)
!-------------------------------------------------------------------------------
! bandred_real/complex: Reduces a distributed symmetric matrix to band form
......@@ -187,6 +185,7 @@
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
integer(kind=ik), intent(in) :: max_threads
call obj%timer%start("bandred_&
......@@ -957,7 +956,7 @@
&PRECISION &
(obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads)
else ! useGPU
call elpa_transpose_vectors_&
&MATH_DATATYPE&
......@@ -965,7 +964,7 @@
&PRECISION &
(obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk, max_threads)
endif
! Calculate umc = A**T * vmr
......@@ -1011,7 +1010,7 @@
#ifdef WITH_OPENMP
#if REALCASE == 1
n_way = omp_get_max_threads()
n_way = max_threads
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
......@@ -1210,7 +1209,7 @@
&PRECISION &
(obj, vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows, &
mpi_comm_rows, umcCUDA, &
cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk)
cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk, max_threads)
else ! useGPU
call elpa_reduce_add_vectors_&
......@@ -1219,7 +1218,7 @@
&PRECISION &
(obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
istep*nbw, n_cols, nblk, max_threads)
endif ! useGPU
endif ! tile_size < istep*nbw .or. n_way > 1
......@@ -1415,7 +1414,7 @@
&PRECISION &
(obj, umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk, max_threads)
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)), &
......@@ -1456,7 +1455,7 @@
&PRECISION &
(obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
1, istep*nbw, n_cols, nblk, max_threads)
endif ! useGPU
......@@ -1464,8 +1463,13 @@
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
! Consistency check: the team size should match the caller-supplied max_threads
if (n_threads .ne. max_threads) then
  print *,"bandred: OpenMP team size ",n_threads," differs from max_threads ",max_threads
endif
if (mod(n_threads, 2) == 0) then
n_way = 2
else
......
......@@ -63,6 +63,9 @@
use elpa_mpi