Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
2d7c8e91
Commit
2d7c8e91
authored
Feb 02, 2017
by
Andreas Marek
Browse files
Refactor OpenMP region in elpa2_bandred
parent
45093e38
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/elpa2_bandred_template.X90
View file @
2d7c8e91
...
...
@@ -165,8 +165,9 @@
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows
#if REALCASE == 1
integer(kind=ik) :: vmrCols
, mynlc
integer(kind=ik) :: vmrCols
#endif
integer(kind=ik) :: mynlc
integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
...
...
@@ -223,10 +224,8 @@
#if REALCASE == 1
logical, intent(in) :: useQR
#endif
#if REALCASE == 1
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp, transformChunkSize
#endif
call timer%start("bandred_&
&MATH_DATATYPE&
...
...
@@ -810,66 +809,9 @@
#endif
#ifdef WITH_OPENMP
#if REALCASE == 1
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!But in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency
!Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
enddo
enddo
endif
enddo
!$omp end parallel
#endif /* REALCASE == 1 */
#if COMPLEXCASE == 1
nlc = 0 ! number of local columns
#if 0
! original complex implementation without openmp. check performance
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
...
...
@@ -895,6 +837,7 @@
endif
enddo
call timer%stop("mpi_communication")
#else /* WITH_MPI */
...
...
@@ -921,10 +864,77 @@
! if (lcx>0) then
! nlc = nlc+1
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
#endif /* if 0 */
#endif /* COMPLEXCASE */
!Open a single OpenMP parallel region so that the region start-up overhead is paid only once.
!At present this does not improve performance, because two OpenMP barriers have to be added around the MPI call,
!but in the future it may be beneficial if these barriers are replaced with a faster implementation.
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!The iterations of this loop are not independent:
!'mynlc' is incremented on every iteration that finds a local column (lcx>0),
!and it is difficult to remove this dependency.
!Thus each thread executes every iteration of the loop, but it only does the work if it 'owns' that iteration.
!That is, a thread only executes the work associated with iteration j if its thread id equals
!(j-1) modulo the number of threads.
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an OpenMP directive using static scheduling and a chunk size of 32.
!However, for some reason that is slower than distributing the chunks manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
#if REALCASE == 1
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
enddo
enddo
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment