Commit 4c19739b authored by Andreas Marek

Remove (buggy) OpenMP functionality from old ELPA_development_version branch

Since the new, tested branch ELPA_development_version_OpenMP is now in
place, the still-buggy OpenMP functionality is removed from the branch
ELPA_development_version.

The branch ELPA_development_version now contains only the untested
support for the MRRR algorithm.
parent c7a581e1
@@ -351,13 +351,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
integer istep, i, j, lcs, lce, lrs, lre
integer tile_size, l_rows_tile, l_cols_tile
integer my_thread, n_threads, max_threads, n_iter
!$ integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
real*8 vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
real*8, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
real*8, allocatable:: ur_p(:,:), uc_p(:,:)
integer pcol, prow
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
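! ---------------------------------------------------------------------------
! Editor's sketch (illustrative only, not part of this commit): the statement
! function above is the usual block-cyclic mapping of a global column to a
! processor column. A standalone check with hypothetical values of nblk and
! np_cols could look like this:
program check_pcol
   implicit none
   integer, parameter :: nblk = 4, np_cols = 3
   integer :: i
   do i = 1, 12
      ! same formula as the pcol statement function above
      print '(a,i3,a,i2)', ' global column', i, ' -> processor column', MOD((i-1)/nblk, np_cols)
   enddo
end program check_pcol
! ---------------------------------------------------------------------------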
@@ -391,11 +387,6 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
allocate(vc(max_local_cols))
allocate(uc(max_local_cols))
max_threads = 1
!$ max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1))
allocate(uc_p(max_local_cols,0:max_threads-1))
tmp = 0
vr = 0
ur = 0
@@ -487,16 +478,6 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
ur(1:l_rows) = 0
if(l_rows>0 .and. l_cols>0) then
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = 0
n_threads = 1
!$ my_thread = omp_get_thread_num()
!$ n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
@@ -505,19 +486,11 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
lrs = j*l_rows_tile+1
lre = min(l_rows,(j+1)*l_rows_tile)
if(lre<lrs) cycle
if(mod(n_iter,n_threads) == my_thread) then
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1)
if(i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1)
endif
n_iter = n_iter+1
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc(lcs),1)
if(i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur(lrs),1)
n_iter = n_iter+1
enddo
enddo
!$OMP END PARALLEL
do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo
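! ---------------------------------------------------------------------------
! Editor's sketch (illustrative only, not part of this commit): the OpenMP code
! removed above distributed the tile GEMVs round-robin over the threads via
! mod(n_iter,n_threads)==my_thread, accumulated into per-thread buffers
! (uc_p/ur_p) and summed those buffers afterwards. A minimal standalone version
! of that pattern, with hypothetical sizes:
program roundrobin_reduce
   implicit none
   integer, parameter :: n = 8, ntiles = 16
   integer my_thread, n_threads, n_iter, i, t, max_threads
   real*8 u(n)
   real*8, allocatable:: u_p(:,:)
!$ integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
   max_threads = 1
!$ max_threads = omp_get_max_threads()
   allocate(u_p(n,0:max_threads-1))
   u = 0
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i)
   my_thread = 0
   n_threads = 1
!$ my_thread = omp_get_thread_num()
!$ n_threads = omp_get_num_threads()
   u_p(:,my_thread) = 0
   n_iter = 0
   do i = 1, ntiles
      ! each tile is handled by exactly one thread
      if(mod(n_iter,n_threads) == my_thread) u_p(:,my_thread) = u_p(:,my_thread) + 1.d0
      n_iter = n_iter + 1
   enddo
!$OMP END PARALLEL
   do t = 0, max_threads-1
      u(:) = u(:) + u_p(:,t)
   enddo
   print *, 'every entry should equal ntiles:', u(1)
end program roundrobin_reduce
! ---------------------------------------------------------------------------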
if(nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1)
@@ -1055,14 +1028,10 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
integer istep, i, j, lcs, lce, lrs, lre
integer tile_size, l_rows_tile, l_cols_tile
integer my_thread, n_threads, max_threads, n_iter
!$ integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
real*8 vnorm2
complex*16 vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
complex*16, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
complex*16, allocatable:: ur_p(:,:), uc_p(:,:)
real*8, allocatable:: tmpr(:)
integer pcol, prow
@@ -1097,11 +1066,6 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
allocate(vc(max_local_cols))
allocate(uc(max_local_cols))
max_threads = 1
!$ max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1))
allocate(uc_p(max_local_cols,0:max_threads-1))
tmp = 0
vr = 0
ur = 0
@@ -1193,16 +1157,6 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
ur(1:l_rows) = 0
if(l_rows>0 .and. l_cols>0) then
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = 0
n_threads = 1
!$ my_thread = omp_get_thread_num()
!$ n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
@@ -1211,19 +1165,11 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
lrs = j*l_rows_tile+1
lre = min(l_rows,(j+1)*l_rows_tile)
if(lre<lrs) cycle
if(mod(n_iter,n_threads) == my_thread) then
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc_p(lcs,my_thread),1)
if(i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur_p(lrs,my_thread),1)
endif
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc(lcs),1)
if(i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur(lrs),1)
n_iter = n_iter+1
enddo
enddo
!$OMP END PARALLEL
do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo
if(nstor>0) then
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1)
@@ -2180,7 +2126,6 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
real*8 z(na), d1(na), d2(na), z1(na), delta(na), dbase(na), ddiff(na), ev_scale(na), tmp(na)
real*8 d1u(na), zu(na), d1l(na), zl(na)
real*8, allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
real*8, allocatable :: z_p(:,:)
integer i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, l_rqm, ns, info
integer l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, l_cols_qreorg, np, l_idx, nqcols1, nqcols2
@@ -2189,13 +2134,6 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
integer idx(na), idx1(na), idx2(na)
integer coltyp(na), idxq1(na), idxq2(na)
integer max_threads, my_thread
!$ integer omp_get_max_threads, omp_get_thread_num
max_threads = 1
!$ max_threads = omp_get_max_threads()
allocate(z_p(na,0:max_threads-1))
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -2468,16 +2406,11 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
! Solve secular equation
z(1:na1) = 1
z_p(1:na1,:) = 1
dbase(1:na1) = 0
ddiff(1:na1) = 0
info = 0
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
my_thread = 0
!$ my_thread = omp_get_thread_num()
!$OMP DO
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
@@ -2492,9 +2425,9 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
! Compute updated z
do j=1,na1
if(i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
if(i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
enddo
z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
z(i) = z(i)*delta(i)
! store dbase/ddiff
@@ -2511,10 +2444,6 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
ddiff(i) = delta(i)
endif
enddo
!$OMP END PARALLEL
do i = 0, max_threads-1
z(1:na1) = z(1:na1)*z_p(1:na1,i)
enddo
call global_product(z, na1)
z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
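! ---------------------------------------------------------------------------
! Editor's note (illustrative only, not part of this commit): the loop above
! recomputes |z| via the Gu/Eisenstat identity for the rank-one modified
! diagonal problem,
!    |z(j)|**2 = prod_{i=1..na1} (lambda_i - d1(j)) / prod_{i/=j} (d1(i) - d1(j)),
! accumulated from the DLAED4 output delta(j) = d1(j) - lambda_i; the SIGN/SQRT
! line above then restores the original sign from z1.
! ---------------------------------------------------------------------------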
@@ -2527,7 +2456,6 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
ev_scale(:) = 0
!$OMP PARALLEL DO PRIVATE(i,tmp)
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code
@@ -2541,7 +2469,7 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
tmp(1:na1) = z(1:na1) / tmp(1:na1)
ev_scale(i) = 1.0/sqrt(dot_product(tmp(1:na1),tmp(1:na1)))
enddo
!$OMP END PARALLEL DO
call global_gather(ev_scale, na1)
! Add the deflated eigenvalues
@@ -922,10 +922,8 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
integer my_prow, np_rows, my_pcol, np_cols
integer ireq_ab, ireq_hv
integer na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
integer max_threads, my_thread, my_block_s, my_block_e, iter
integer mpi_status(MPI_STATUS_SIZE)
integer, allocatable :: mpi_statuses(:,:)
integer, allocatable :: omp_block_limits(:)
integer, allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), global_id_tmp(:,:), hh_cnt(:), hh_dst(:)
integer, allocatable :: limits(:), snd_limits(:,:)
integer, allocatable :: block_limits(:)
@@ -933,7 +931,6 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
! dummies for calling redist_band
complex*16 :: c_a(1,1), c_ab(1,1)
!$ integer :: omp_get_max_threads
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
@@ -1057,25 +1054,6 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
enddo
! OpenMP work distribution:
max_threads = 1
!$ max_threads = omp_get_max_threads()
! For OpenMP we need at least 2 blocks for every thread
max_threads = MIN(max_threads, nblocks/2)
if(max_threads==0) max_threads = 1
allocate(omp_block_limits(0:max_threads))
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads))
hv_t = 0
tau_t = 0
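! ---------------------------------------------------------------------------
! Editor's note (illustrative only, not part of this commit): the removed setup
! above capped the thread count at nblocks/2 so that every thread owns at least
! two blocks. For example, with nblocks = 5 and omp_get_max_threads() = 4 this
! gives max_threads = MIN(4, 5/2) = 2, and divide_band then splits the 5 blocks
! into contiguous (presumably near-equal) per-thread ranges via omp_block_limits.
! ---------------------------------------------------------------------------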
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
@@ -1087,7 +1065,7 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
do istep=1,na-1-block_limits(my_pe)*nb
do istep=1,na-1
if(my_pe==0) then
n = MIN(na-na_s,nb) ! number of rows to be reduced
@@ -1124,171 +1102,6 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
n_off = n_off + nb
endif
if(max_threads > 1) then
! Codepath for OpenMP
! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
! This mimics the behaviour of the MPI tasks, which also work one after the other.
! The code would be considerably easier if the MPI communication were done within
! the parallel region - this is avoided here since it would require
! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
hv_t(:,1) = hv
tau_t(1) = tau
do iter = 1, 2
! iter=1 : work on first block
! iter=2 : work on remaining blocks
! This is done in 2 iterations so that we have a barrier in between:
! After the first iteration, it is guaranteed that the last row of the last block
! is completed by the next thread.
! The end of the first iteration is also the place to exchange the last row
! with MPI calls
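! Editor's note (illustrative only, not part of this commit): with the
! "if(istep<my_thread ...) exit" test below, thread t only starts producing
! results at istep = t, so each thread stays exactly one reduction cycle behind
! its predecessor. Iteration 1 covers only each thread's first block; the
! implicit barrier at the end of the parallel do then lets the MPI exchange of
! the first/last column happen before iteration 2 processes the remaining blocks.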
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if(iter == 1) then
my_block_s = omp_block_limits(my_thread-1) + 1
my_block_e = my_block_s
else
my_block_s = omp_block_limits(my_thread-1) + 2
my_block_e = omp_block_limits(my_thread)
endif
do iblk = my_block_s, my_block_e
ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
ne = ns+nb-1 ! last column in block
if(istep<my_thread .or. ns+n_off>na) exit
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1)
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1)
hv_t(:,my_thread) = 0
tau_t(my_thread) = 0
if(nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1)
if(nr>1) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation for first column
! (stored in hv_t(:,my_thread) and tau_t(my_thread))
vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
hv_t(1 ,my_thread) = 1.
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1)
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i)
enddo
else
! No new Householder transformation for nr=1, just complete the old one
ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
enddo
! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
hv_t(1,my_thread) = 1.
endif
enddo
enddo ! my_thread
!$omp end parallel do
if (iter==1) then
! We are at the end of the first block
! Send our first column to previous PE
if(my_pe>0 .and. na_s <= na) then
call mpi_wait(ireq_ab,mpi_status,mpierr)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
! Request last column from next PE
ne = na_s + nblocks*nb - (max_threads-1) - 1
if(istep>=max_threads .and. ne <= na) then
call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr)
endif
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if(istep>=max_threads .and. ne < na) then
call mpi_wait(ireq_hv,mpi_status,mpierr)
hv_s(1) = tau_t(max_threads)
hv_s(2:) = hv_t(2:,max_threads)
call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
! "Send" HH vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
enddo
endif
enddo ! iter
else
! Codepath for 1 thread without OpenMP
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
@@ -1302,6 +1115,27 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
@@ -1418,31 +1252,6 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm
enddo
endif
do iblk = 1, nblocks
if(hh_dst(iblk) >= np_rows) exit
if(snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
enddo
enddo
! Finish the last outstanding requests
call mpi_wait(ireq_ab,mpi_status,mpierr)
call mpi_wait(ireq_hv,mpi_status,mpierr)
@@ -1505,16 +1314,16 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
integer current_n, current_local_n, current_n_start, current_n_end
integer next_n, next_local_n, next_n_start, next_n_end
integer bottom_msg_length, top_msg_length, next_top_msg_length
integer thread_width, stripe_width, stripe_count, csw
integer stripe_width, last_stripe_width, stripe_count
integer num_result_blocks, num_result_buffers, num_bufs_recvd
integer a_off, current_tv_off, max_blk_size, b_off, b_len
integer a_off, current_tv_off, max_blk_size
integer mpierr, src, src_offset, dst, offset, nfact, num_blk
integer mpi_status(MPI_STATUS_SIZE)
logical flag
real*8, allocatable :: a(:,:,:,:), row(:)
real*8, allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
real*8, allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
real*8, allocatable :: a(:,:,:), row(:)
real*8, allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
real*8, allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
real*8, allocatable :: result_buffer(:,:,:)
real*8, allocatable :: bcast_buffer(:,:)
@@ -1530,9 +1339,6 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
integer, parameter :: top_recv_tag = 222
integer, parameter :: result_recv_tag = 333
integer :: max_threads, my_thread
!$ integer :: omp_get_max_threads
! Just for measuring the kernel performance
real*8 kernel_time
integer*8 kernel_flops
@@ -1541,9 +1347,6 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
kernel_time = 1.d-100
kernel_flops = 0
max_threads = 1
!$ max_threads = omp_get_max_threads()
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
@@ -1564,18 +1367,17 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if(l_nev==0) then
thread_width = 0
stripe_width = 0
stripe_count = 0
else
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
stripe_width = 48 ! Must be a multiple of 4
stripe_count = (thread_width-1)/stripe_width + 1
stripe_count = (l_nev-1)/stripe_width + 1
! Adapt stripe width so that last one doesn't get too small
stripe_width = (thread_width-1)/stripe_count + 1
stripe_width = (l_nev-1)/stripe_count + 1
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
endif
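! ---------------------------------------------------------------------------
! Editor's note (illustrative only, not part of this commit): 48*64 real*8
! numbers are 24 kB, which is why a stripe width of 48 is suggested to fit the
! L1 cache. Worked example for a hypothetical l_nev = 100:
!    stripe_count      = (100-1)/48 + 1 = 3
!    stripe_width      = (100-1)/3  + 1 = 34, rounded up to a multiple of 4 -> 36
!    last_stripe_width = 100 - (3-1)*36 = 28
! ---------------------------------------------------------------------------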
! Determine the matrix distribution at the beginning
@@ -1589,8 +1391,8 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
allocate(a(stripe_width,a_dim2,stripe_count,max_threads))
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
allocate(a(stripe_width,a_dim2,stripe_count))
a(:,:,:) = 0
allocate(row(l_nev))
row(:) = 0
@@ -1601,15 +1403,6 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation!