Commit 2d7c8e91 authored by Andreas Marek's avatar Andreas Marek
Browse files

Refactor OpenMP region in elpa2_bandred

parent 45093e38
...@@ -165,8 +165,9 @@ ...@@ -165,8 +165,9 @@
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows integer(kind=ik) :: l_cols, l_rows
#if REALCASE == 1 #if REALCASE == 1
integer(kind=ik) :: vmrCols, mynlc integer(kind=ik) :: vmrCols
#endif #endif
integer(kind=ik) :: mynlc
integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
...@@ -223,10 +224,8 @@ ...@@ -223,10 +224,8 @@
#if REALCASE == 1 #if REALCASE == 1
logical, intent(in) :: useQR logical, intent(in) :: useQR
#endif #endif
#if REALCASE == 1
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp, transformChunkSize ii, pp, transformChunkSize
#endif
call timer%start("bandred_& call timer%start("bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -810,66 +809,9 @@ ...@@ -810,66 +809,9 @@
#endif #endif
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#if 0
#if REALCASE == 1 ! original complex implementation without openmp. check performance
!Open up one omp region to avoid paying openmp overhead. nlc = 0 ! number of local columns
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!But in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency
!Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
enddo
enddo
endif
enddo
!$omp end parallel
#endif /* REALCASE == 1 */
#if COMPLEXCASE == 1
nlc = 0 ! number of local columns
do j=1,lc-1 do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then if (lcx>0) then
...@@ -895,6 +837,7 @@ ...@@ -895,6 +837,7 @@
endif endif
enddo enddo
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
#else /* WITH_MPI */ #else /* WITH_MPI */
...@@ -921,10 +864,77 @@ ...@@ -921,10 +864,77 @@
! if (lcx>0) then ! if (lcx>0) then
! nlc = nlc+1 ! nlc = nlc+1
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) ! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif ! endif
! enddo ! enddo
#endif /* if 0 */
#endif /* COMPLEXCASE */ !Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!But in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency
!Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
#if REALCASE == 1
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
enddo
enddo
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment