Commit fe63372d authored by Alexander Heinecke

This commit improves ELPA's performance on Intel(R) Xeon(R) E5 v2 and E5 v3 series CPUs by:

- enabling the fusing of iterations in stage 5 of ELPA2 for every configuration
- increasing the reduction bandwidth in ELPA2 to at least 64
- partially parallelizing the QR factorization in bandred_real with OpenMP
- parallelizing SYMM with OpenMP
- parallelizing SYR2K in bandred_real with OpenMP
- parallelizing elpa1_reduce_add_vectors and elpa1_transpose_vectors with OpenMP
- adding AVX2 support to the back-transformation elpa2_kernels (FMA3 instructions, introduced with the Haswell microarchitecture)
parent bf168297
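The OpenMP additions to bandred_real, elpa_transpose_vectors and elpa_reduce_add_vectors in the diff below all follow the same hybrid pattern: a single parallel region that encloses both the local work and the MPI communication, worksharing loops for the purely local packing and update steps, and the MPI call issued from one thread between two barriers. The following is a minimal, hypothetical sketch of that pattern (illustration only, not part of the commit; names and sizes are made up, and it assumes an MPI library providing at least MPI_THREAD_FUNNELED):

program omp_master_mpi_sketch
   use mpi
   use omp_lib
   implicit none
   integer, parameter :: n = 1024
   real*8  :: aux1(n), aux2(n)
   integer :: i, provided, mpierr

   call mpi_init_thread(MPI_THREAD_FUNNELED, provided, mpierr)

   !$omp parallel private(i)
   ! local packing work; iterations are independent and shared among the threads
   !$omp do
   do i = 1, n
      aux1(i) = dble(i)
   enddo
   !$omp end do

   !$omp barrier     ! aux1 must be complete before the reduction
   !$omp master
   call mpi_allreduce(aux1, aux2, n, MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, mpierr)
   !$omp end master
   !$omp barrier     ! aux2 must be visible to all threads afterwards

   ! local update using the globally reduced data
   !$omp do
   do i = 1, n
      aux1(i) = aux1(i) + aux2(i)
   enddo
   !$omp end do
   !$omp end parallel

   call mpi_finalize(mpierr)
end program omp_master_mpi_sketch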
......@@ -14,6 +14,9 @@
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
......@@ -4274,11 +4277,14 @@ subroutine elpa_transpose_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,
!-------------------------------------------------------------------------------
use ELPA1 ! for least_common_multiple
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
include 'mpif.h'
integer, intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk
real*8, intent(in) :: vmat_s(ld_s,nvc)
real*8, intent(inout) :: vmat_t(ld_t,nvc)
......@@ -4287,6 +4293,8 @@ subroutine elpa_transpose_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,
integer myps, mypt, nps, npt
integer n, lc, k, i, ips, ipt, ns, nl, mpierr
integer lcm_s_t, nblks_tot, nblks_comm, nblks_skip
integer auxstride
integer, parameter :: error_unit = 6
call mpi_comm_rank(comm_s,myps,mpierr)
call mpi_comm_size(comm_s,nps ,mpierr)
......@@ -4310,8 +4318,9 @@ subroutine elpa_transpose_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,
nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
allocate( aux( (nblks_tot-nblks_skip+lcm_s_t-1) / lcm_s_t * nblk * nvc ) )
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
do n = 0, lcm_s_t-1
ips = mod(n,nps)
......@@ -4320,35 +4329,41 @@ subroutine elpa_transpose_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,
if(mypt == ipt) then
nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t
if(nblks_comm==0) cycle
auxstride = nblk * nblks_comm
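! With auxstride, the offset k in the loops below is computed directly from (i, lc)
! instead of being accumulated, so the iterations of the lc loops are independent
! and can be distributed across threads with !$omp do.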
if(nblks_comm .ne. 0) then
if(myps == ips) then
k = 0
!$omp do
do lc=1,nvc
do i = nblks_skip+n, nblks_tot-1, lcm_s_t
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/nps)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
k = k+nblk
enddo
enddo
endif
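! Only the master thread issues the broadcast; the barriers ensure aux is fully
! packed before the call and fully received before any thread starts unpacking it.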
!$omp barrier
!$omp master
call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_REAL8,ips,comm_s,mpierr)
!$omp end master
!$omp barrier
k = 0
!$omp do
do lc=1,nvc
do i = nblks_skip+n, nblks_tot-1, lcm_s_t
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl)
k = k+nblk
enddo
enddo
endif
endif
enddo
!$omp end parallel
deallocate(aux)
......@@ -4380,6 +4395,9 @@ subroutine elpa_reduce_add_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc
!-------------------------------------------------------------------------------
use ELPA1 ! for least_common_multiple
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
......@@ -4393,6 +4411,7 @@ subroutine elpa_reduce_add_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc
integer myps, mypt, nps, npt
integer n, lc, k, i, ips, ipt, ns, nl, mpierr
integer lcm_s_t, nblks_tot
integer auxstride, tylerk, error_unit
call mpi_comm_rank(comm_s,myps,mpierr)
call mpi_comm_size(comm_s,nps ,mpierr)
......@@ -4412,34 +4431,42 @@ subroutine elpa_reduce_add_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc
allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
aux1(:) = 0
aux2(:) = 0
!$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl)
do n = 0, lcm_s_t-1
ips = mod(n,nps)
ipt = mod(n,npt)
auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t)
if(myps == ips) then
k = 0
!$omp do
do lc=1,nvc
do i = n, nblks_tot-1, lcm_s_t
k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/nps)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
k = k+nblk
enddo
enddo
k = nvc * auxstride
!$omp barrier
!$omp master
if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr)
!$omp end master
!$omp barrier
if(mypt == ipt) then
k = 0
!$omp do
do lc=1,nvc
do i = n, nblks_tot-1, lcm_s_t
k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
k = k+nblk
enddo
enddo
endif
......@@ -4447,6 +4474,7 @@ subroutine elpa_reduce_add_vectors(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc
endif
enddo
!$omp end parallel
deallocate(aux1)
deallocate(aux2)
......
......@@ -14,6 +14,9 @@
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
......@@ -252,8 +255,8 @@ function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
nbw = (63/nblk+1)*nblk
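! With integer division this is the smallest multiple of nblk that is >= 64,
! e.g. nblk=16: (63/16+1)*16 = 64; nblk=32: (63/32+1)*32 = 64; nblk=48: (63/48+1)*48 = 96.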
num_blocks = (na-1)/nbw + 1
......@@ -574,6 +577,9 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
......@@ -582,8 +588,8 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: l_cols, l_rows
integer :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer :: istep, ncol, lch, lcx, nlc
integer :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer :: istep, ncol, lch, lcx, nlc, mynlc
integer :: tile_size, l_rows_tile, l_cols_tile
real*8 :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
......@@ -600,6 +606,8 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
logical, intent(in) :: useQR
integer :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_real")
#endif
......@@ -743,7 +751,55 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
! Local dot product
aux1 = 0
#ifdef WITH_OPENMP
!Open a single OpenMP region to avoid paying the OpenMP startup overhead repeatedly.
!Currently this does not help performance, because of the two OpenMP barriers added around the MPI call,
!but in the future it may become beneficial if these barriers are replaced with a faster implementation.
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations: 'mynlc' is incremented in every iteration,
!and it is difficult to remove this dependency.
!Thus each thread executes every iteration of the loop, but only does the work of an iteration
!if it 'owns' it, i.e. if its thread id is congruent to the iteration number modulo the
!number of threads.
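!For example, with 4 threads: thread 0 does the work for j = 1, 5, 9, ...,
!thread 1 for j = 2, 6, 10, ..., while every thread keeps mynlc consistent.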
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an OpenMP do directive using static scheduling and chunk size 32.
!However, for some reason that is slower than chunking by hand, so it is parallelized manually as below.
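!For example, with transformChunkSize = 32 and 4 threads, thread 0 updates rows 1..32, 129..160, ...,
!and thread 1 updates rows 33..64, 161..192, and so on.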
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
enddo
enddo
endif
enddo
!$omp end parallel
#else
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
......@@ -766,7 +822,7 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
#endif
enddo
! Calculate scalar products of stored Householder vectors.
......@@ -798,36 +854,101 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
! Note that the distributed A has to be transposed
! Opposed to direct tridiagonalization there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
!Code for Algorithm 4
umc(1:l_cols,1:n_cols) = 0.d0
vmr(1:l_rows,n_cols+1:2*n_cols) = 0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
n_way = 1
#ifdef WITH_OPENMP
n_way = omp_get_max_threads()
#endif
!umc(1:l_cols,1:n_cols) = 0.d0
!vmr(1:l_rows,n_cols+1:2*n_cols) = 0
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
!$omp parallel private( i,lcs,lce,lrs,lre)
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))
if(n_way > 1) then
!$omp do
do i=1,min(l_cols_tile, l_cols)
umc(i,1:n_cols) = 0.d0
enddo
!$omp do
do i=1,l_rows
vmr(i,n_cols+1:2*n_cols) = 0.d0
enddo
if (l_cols>0 .and. l_rows>0) then
!SYMM variant 4
!Partitioned Matrix Expression:
! Ct = Atl Bt + Atr Bb
! Cb = Atr' Bt + Abl Bb
!
!Loop invariant:
! Ct = Atl Bt + Atr Bb
!
!Update:
! C1 = A10'B0 + A11B1 + A21 B2
!
!This algorithm was chosen because here the loop around the DGEMM calls
!is easily parallelized, and regardless of the choice of algorithm,
!the startup cost of parallelizing the DGEMMs inside the loop is too great.
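!With schedule(static,1), tile i is handled by thread mod(i, number of threads), so the
!increasingly expensive A10'*B0 updates of later tiles are dealt out round-robin across the threads.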
!$omp do schedule(static,1)
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
lrs = i*l_rows_tile+1 ! local row start
lre = min(l_rows, (i+1)*l_rows_tile) ! local row end
!C1 += [A11 A12] [B1
! B2]
if( lre > lrs .and. l_cols > lcs ) then
call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1.d0, a(lrs,lcs), ubound(a,1), &
umc(lcs,n_cols+1), ubound(umc,1), &
0.d0, vmr(lrs,n_cols+1), ubound(vmr,1))
endif
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
enddo
! C1 += A10' B0
if( lce > lcs .and. i > 0 ) then
call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, &
1.d0, a(1,lcs), ubound(a,1), &
vmr(1,1), ubound(vmr,1), &
0.d0, umc(lcs,1), ubound(umc,1))
endif
enddo
endif
else
umc(1:l_cols,1:n_cols) = 0.d0
vmr(1:l_rows,n_cols+1:2*n_cols) = 0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
enddo
endif
endif
!$omp end parallel
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
umc, ubound(umc,1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
! ... or if Algorithm 4 (n_way > 1) was used above
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
umc, ubound(umc,1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
......@@ -858,7 +979,42 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
if(mod(n_threads, 2) == 0) then
n_way = 2
else
n_way = 1
endif
m_way = n_threads / n_way
m_id = mod(omp_get_thread_num(), m_way)
n_id = omp_get_thread_num() / m_way
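!The threads form an m_way x n_way grid: n_id selects which tiles a thread works on
!(every n_way-th tile), and m_id selects a contiguous range of rows within each tile.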
do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
i = ii / tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
!Figure out this thread's range
work_per_thread = lre / m_way
if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
mystart = m_id * work_per_thread + 1
myend = mystart + work_per_thread - 1
if( myend > lre ) myend = lre
if( myend-mystart+1 < 1) cycle
call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), &
1.d0,a(mystart,lcs),ubound(a,1))
enddo
!$omp end parallel
#else
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
......@@ -868,7 +1024,7 @@ subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
vmr,ubound(vmr,1),umc(lcs,1),ubound(umc,1), &
1.d0,a(1,lcs),lda)
enddo
#endif
deallocate(vmr, umc, vr)
enddo
......@@ -1004,8 +1160,8 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
if (useQR) then
t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
if ( .true. ) then
t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
cwy_blocking = t_blocking * nbw
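! e.g. with nbw = 64 and t_blocking = 3, cwy_blocking = 192, i.e. 192 Householder vectors are aggregated and applied at once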
allocate(tmp1(max_local_cols*cwy_blocking))
......@@ -1027,7 +1183,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
if (useQR) then
if ( .true. ) then
do istep=1,((na-1)/nbw-1)/t_blocking + 1
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
......
......@@ -56,6 +56,11 @@ module mod_setup_mpi
#ifdef WITH_OPENMP
integer :: required_mpi_thread_level, &
provided_mpi_thread_level
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
#endif
#endif
#ifndef WITH_OPENMP
call mpi_init(mpierr)
......