diff --git a/src/elpa1/elpa1_template.F90 b/src/elpa1/elpa1_template.F90 index 5431b00a89917769599036a2a130113b88d1132e..acae4fb1270b48e5dd696bd3fe1966f4550e5431 100644 --- a/src/elpa1/elpa1_template.F90 +++ b/src/elpa1/elpa1_template.F90 @@ -66,6 +66,9 @@ function elpa_solve_evp_& use elpa_abstract_impl use elpa_mpi use elpa1_compute +#ifdef WITH_OPENMP + use omp_lib +#endif implicit none #include "../general/precision_kinds.F90" class(elpa_abstract_impl_t), intent(inout) :: obj @@ -110,7 +113,7 @@ function elpa_solve_evp_& mpi_comm_all, check_pd, i, error logical :: do_bandred, do_solve, do_trans_ev - integer(kind=ik) :: nrThreads, omp_get_num_threads + integer(kind=ik) :: nrThreads call obj%timer%start("elpa_solve_evp_& &MATH_DATATYPE& @@ -119,7 +122,7 @@ function elpa_solve_evp_& &") #ifdef WITH_OPENMP - nrThreads = omp_get_num_threads() + nrThreads = omp_get_max_threads() #else nrThreads = 1 #endif @@ -310,7 +313,7 @@ function elpa_solve_evp_& if (obj%eigenvalues_only) then do_trans_ev = .true. endif - + print *,"ELPA 1 ",nrThreads if (do_bandred) then call obj%timer%start("forward") call tridiag_& diff --git a/src/elpa1/elpa1_tridiag_template.F90 b/src/elpa1/elpa1_tridiag_template.F90 index ba8ca58341257e19df4a331f35378d43761211dd..1bb66caa161f1b3478ae832b7d88a54cb78e5742 100644 --- a/src/elpa1/elpa1_tridiag_template.F90 +++ b/src/elpa1/elpa1_tridiag_template.F90 @@ -535,11 +535,6 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_ my_thread = omp_get_thread_num() n_threads = omp_get_num_threads() - ! debug REMOVE again - print *,"debug" - if (n_threads .ne. max_threads) then - print *,"WTF?" - endif n_iter = 0 diff --git a/src/elpa1/elpa_cholesky_template.F90 b/src/elpa1/elpa_cholesky_template.F90 index 73709eb79b888f5ae95a86890f5bd8ac853f6f02..7a9ef3992dbeecdd6db8ac68102938ff4ff4c16e 100644 --- a/src/elpa1/elpa_cholesky_template.F90 +++ b/src/elpa1/elpa_cholesky_template.F90 @@ -71,7 +71,7 @@ logical :: success integer(kind=ik) :: istat, debug, error character(200) :: errorMessage - integer(kind=ik) :: max_threads + integer(kind=ik) :: nrThreads call obj%timer%start("elpa_cholesky_& &MATH_DATATYPE& @@ -80,9 +80,9 @@ &") #ifdef WITH_OPENMP - max_threads=omp_get_num_threads() + nrThreads=omp_get_max_threads() #else - max_threads=1 + nrThreads=1 #endif na = obj%na @@ -295,7 +295,7 @@ &PRECISION & (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk, max_threads) + n, na, nblk, nblk, nrThreads) do i=0,(na-1)/tile_size lcs = max(l_colx,i*l_cols_tile+1) diff --git a/src/elpa1/elpa_solve_tridi_impl_public.F90 b/src/elpa1/elpa_solve_tridi_impl_public.F90 index 3d3a83172839f9e23c3fb0b05aecd40008b827c2..0a8c556224e419408af2961d67f8bf34fa07ad76 100644 --- a/src/elpa1/elpa_solve_tridi_impl_public.F90 +++ b/src/elpa1/elpa_solve_tridi_impl_public.F90 @@ -95,7 +95,7 @@ matrixCols = obj%local_ncols #ifdef WITH_OPENMP - max_threads=omp_get_num_threads() + max_threads=omp_get_max_threads() #else max_threads=1 #endif diff --git a/src/elpa2/elpa2_bandred_template.F90 b/src/elpa2/elpa2_bandred_template.F90 index 16351f464a5e171a241c14ea93f2b757ae1a9f50..bba16638539f829522d690c99f652e0706900c50 100644 --- a/src/elpa2/elpa2_bandred_template.F90 +++ b/src/elpa2/elpa2_bandred_template.F90 @@ -1463,13 +1463,9 @@ ! A = A - V*U**T - U*V**T #ifdef WITH_OPENMP - ! OPENMP_CHANGE here !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) n_threads = omp_get_num_threads() - print *,"debug" - if (n_threads .ne. max_threads) then - print *,"WTF2" - endif + if (mod(n_threads, 2) == 0) then n_way = 2 else diff --git a/src/elpa2/elpa2_template.F90 b/src/elpa2/elpa2_template.F90 index 5e871232c5c39cae735123738cfdad67556d9c22..61ec39d0fd46870bf1a0d10cb5fe2b082afb6cf5 100644 --- a/src/elpa2/elpa2_template.F90 +++ b/src/elpa2/elpa2_template.F90 @@ -490,7 +490,7 @@ &_& &PRECISION& (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & - do_useGPU, wantDebug) + do_useGPU, wantDebug, nrThreads) #ifdef WITH_MPI call obj%timer%start("mpi_communication") diff --git a/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 b/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 index ea98ac92d36ed34570816d187eb1c04de6c36d89..48970cc11b07e645d01e4beac786e62acdefbbc9 100644 --- a/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 +++ b/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 @@ -97,7 +97,7 @@ use precision use iso_c_binding #ifdef WITH_OPENMP - use omp_lib + ! use omp_lib #endif implicit none #include "../general/precision_kinds.F90" @@ -219,11 +219,6 @@ kernel_time = 0.0 kernel_flops = 0 -!#ifdef WITH_OPENMP -! ! openmp_change_here -! max_threads = 1 -! max_threads = omp_get_max_threads() -!#endif if (wantDebug) call obj%timer%start("mpi_communication") call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) diff --git a/src/elpa2/elpa2_tridiag_band_template.F90 b/src/elpa2/elpa2_tridiag_band_template.F90 index 8418b093b9cbae6a49406025fb448cff85ce0d2b..0f32b4e25edee4b944051462739345acf237f781 100644 --- a/src/elpa2/elpa2_tridiag_band_template.F90 +++ b/src/elpa2/elpa2_tridiag_band_template.F90 @@ -56,7 +56,7 @@ &_& &PRECISION & (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, & - hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug) + hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads) !------------------------------------------------------------------------------- ! tridiag_band_real/complex: ! Reduces a real symmetric band matrix to tridiagonal form @@ -89,6 +89,9 @@ use precision use iso_c_binding use redist +#ifdef WITH_OPENMP + use omp_lib +#endif implicit none #include "../general/precision_kinds.F90" class(elpa_abstract_impl_t), intent(inout) :: obj @@ -112,16 +115,14 @@ integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols integer(kind=ik) :: ireq_ab, ireq_hv integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off + integer(kind=ik), intent(in) :: nrThreads #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter #ifdef WITH_MPI -! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE) #endif -! integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) integer(kind=ik), allocatable :: global_id_tmp(:,:) integer(kind=ik), allocatable :: omp_block_limits(:) MATH_DATATYPE(kind=rck), allocatable :: hv_t(:,:), tau_t(:) - integer(kind=ik) :: omp_get_max_threads #endif /* WITH_OPENMP */ integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) @@ -379,15 +380,7 @@ #ifdef WITH_OPENMP ! OpenMP work distribution: - - max_threads = 1 -#if REALCASE == 1 - ! OPENMP_CHANGE here - max_threads = omp_get_max_threads() -#endif -#if COMPLEXCASE == 1 -!$ max_threads = omp_get_max_threads() -#endif + max_threads = nrThreads ! For OpenMP we need at least 2 blocks for every thread max_threads = MIN(max_threads, nblocks/2) if (max_threads==0) max_threads = 1