elpa / elpa · Commits

Commit 0d08507c
Authored Aug 10, 2020 by Andreas Marek

Rename OPENMP preprocessor macro

Parent: 1aa89171 · Changes: 34 · Pipelines: 1
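This commit renames the WITH_OPENMP preprocessor guard to WITH_OPENMP_TRADITIONAL throughout configure.ac and the Fortran templates. A minimal sketch of how such a guard is consumed downstream, assuming gfortran-style cpp preprocessing (the file name check.F90 and the build line are hypothetical, not part of this commit):

    ! check.F90 -- hypothetical probe for the renamed guard, not ELPA code.
    ! Build e.g.: gfortran -cpp -fopenmp -DWITH_OPENMP_TRADITIONAL check.F90
    program check_openmp_traditional
    #ifdef WITH_OPENMP_TRADITIONAL
      use omp_lib
    #endif
      implicit none
    #ifdef WITH_OPENMP_TRADITIONAL
      ! guard active: the OpenMP runtime is available
      print *, 'OpenMP guard active, max threads:', omp_get_max_threads()
    #else
      print *, 'OpenMP guard inactive'
    #endif
    end program check_openmp_traditional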
configure.ac

@@ -93,9 +93,9 @@ AC_ARG_ENABLE([openmp],
         ],
         [enable_openmp=no])
 AC_MSG_RESULT([${enable_openmp}])
-AM_CONDITIONAL([WITH_OPENMP],[test x"$enable_openmp" = x"yes"])
+AM_CONDITIONAL([WITH_OPENMP_TRADITIONAL],[test x"$enable_openmp" = x"yes"])
 if test x"${enable_openmp}" = x"yes"; then
-  AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
+  AC_DEFINE([WITH_OPENMP_TRADITIONAL], [1], [use OpenMP threading])
 fi
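After this hunk, nothing in the tree defines or tests WITH_OPENMP any more. If out-of-tree code still checks the old name, one way to bridge the transition is a compatibility shim in a shared preprocessed header; a hypothetical sketch, not part of this commit:

    ! bridge the rename for code that still defines the old macro (assumption:
    ! this is included before any use of WITH_OPENMP_TRADITIONAL)
    #if defined(WITH_OPENMP) && !defined(WITH_OPENMP_TRADITIONAL)
    #define WITH_OPENMP_TRADITIONAL 1
    #endif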
src/elpa1/elpa1_merge_systems_real_template.F90

@@ -64,7 +64,7 @@ subroutine merge_systems_&
   use elpa_abstract_impl
   use elpa_blas_interfaces
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   use omp_lib
 #endif
   implicit none

@@ -93,7 +93,7 @@ subroutine merge_systems_&
                               dbase(na), ddiff(na), ev_scale(na), tmp(na)
   real(kind=REAL_DATATYPE)    :: d1u(na), zu(na), d1l(na), zl(na)
   real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
 #endif

@@ -122,7 +122,7 @@ subroutine merge_systems_&
                  &PRECISION&
                  &_real
   integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   integer(kind=ik) :: my_thread
   allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)

@@ -442,7 +442,7 @@ subroutine merge_systems_&
   ! Solve secular equation
   z(1:na1) = 1
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   z_p(1:na1,:) = 1
 #endif
   dbase(1:na1) = 0

@@ -450,7 +450,7 @@ subroutine merge_systems_&
   info = 0
   infoBLAS = int(info, kind=BLAS_KIND)
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 !
 !   call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
 !!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)

@@ -474,7 +474,7 @@ subroutine merge_systems_&
   ! Compute updated z
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 !   do j=1,na1
 !     if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
 !   enddo

@@ -500,7 +500,7 @@ subroutine merge_systems_&
       ddiff(i) = delta(i)
     endif
   enddo
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 !!$OMP END PARALLEL
 !
 !   call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

@@ -526,7 +526,7 @@ subroutine merge_systems_&
   ! Calculate scale factors for eigenvectors
   ev_scale(:) = 0.0_rk
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

@@ -548,7 +548,7 @@ subroutine merge_systems_&
       &(obj, d1, dbase, ddiff, z, ev_scale(i), na1, i)
     ! ev_scale(i) = ev_scale_val
   enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL DO
   call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

@@ -888,7 +888,7 @@ subroutine merge_systems_&
     deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
     check_deallocate("merge_systems: ev, qtmp1, qtmp2", istat, errorMessage)
   endif ! very outer test (na1==1 .or. na1==2)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   deallocate(z_p, stat=istat, errmsg=errorMessage)
   check_deallocate("merge_systems: z_p", istat, errorMessage)
 #endif
src/elpa1/elpa1_template.F90

@@ -201,7 +201,7 @@ function elpa_solve_evp_&
   call mpi_comm_rank(int(mpi_comm_all, kind=MPI_KIND), my_peMPI, mpierr)
   my_pe = int(my_peMPI, kind=c_int)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   omp_threads_caller = omp_get_max_threads()

@@ -263,7 +263,7 @@ function elpa_solve_evp_&
   endif
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)

@@ -562,7 +562,7 @@ function elpa_solve_evp_&
   call nvtxRangePop()
 #endif
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
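The hunks above guard a save/restore idiom: ELPA records the caller's OpenMP thread count on entry and hands it back on exit. A minimal standalone sketch of that idiom (hypothetical names; nrThreads stands in for ELPA's internally chosen thread count):

    ! sketch of the save/restore pattern referenced by the comments above
    subroutine with_internal_threads(nrThreads)
      use omp_lib
      implicit none
      integer, intent(in) :: nrThreads
      integer :: omp_threads_caller

      omp_threads_caller = omp_get_max_threads()   ! remember the caller's setting
      call omp_set_num_threads(nrThreads)          ! run with the library's own count

      ! ... threaded computation ...

      call omp_set_num_threads(omp_threads_caller) ! restore the caller's setting
    end subroutine with_internal_threads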
src/elpa1/elpa1_tridiag_template.F90

@@ -145,7 +145,7 @@ subroutine tridiag_&
   integer(kind=c_intptr_t) :: a_offset
   integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   integer(kind=ik) :: my_thread, n_threads, n_iter
 #endif

@@ -170,7 +170,7 @@ subroutine tridiag_&
   ! pattern: u1,v1,u2,v2,u3,v3,....
   MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
 #endif

@@ -355,7 +355,7 @@ subroutine tridiag_&
   endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
   call check_alloc("tridiag_&
   &MATH_DATATYPE ", "ur_p", istat, errorMessage)

@@ -363,7 +363,7 @@ subroutine tridiag_&
   allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
   call check_alloc("tridiag_&
   &MATH_DATATYPE ", "uc_p", istat, errorMessage)
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   tmp = 0
   v_row = 0

@@ -579,7 +579,7 @@ subroutine tridiag_&
     check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
   endif ! useGPU
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   call obj%timer%start("OpenMP parallel")
 !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

@@ -592,7 +592,7 @@ subroutine tridiag_&
   ! first calculate A*v part of (A + VU**T + UV**T)*v
   uc_p(1:l_cols,my_thread) = 0.
   ur_p(1:l_rows,my_thread) = 0.
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   do i = 0, (istep-2)/tile_size
     l_col_beg = i*l_cols_per_tile+1
     l_col_end = min(l_cols,(i+1)*l_cols_per_tile)

@@ -601,7 +601,7 @@ subroutine tridiag_&
     l_row_beg = j*l_rows_per_tile+1
     l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
     if (l_row_end < l_row_beg) cycle
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
     if (mod(n_iter,n_threads) == my_thread) then
       if (wantDebug) call obj%timer%start("blas")
       call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &

@@ -628,7 +628,7 @@ subroutine tridiag_&
       if (wantDebug) call obj%timer%stop("blas")
     endif
     n_iter = n_iter+1
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
   ! multiplication by blocks is efficient only for CPU
   ! for GPU we introduced 2 other ways, either by stripes (more simmilar to the original

@@ -658,7 +658,7 @@ subroutine tridiag_&
       if (wantDebug) call obj%timer%stop("blas")
     endif ! not useGPU
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   enddo ! j=0,i
   enddo ! i=0,(istep-2)/tile_size

@@ -738,7 +738,7 @@ subroutine tridiag_&
     check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
   endif ! useGPU
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
   call obj%timer%stop("OpenMP parallel")

@@ -746,7 +746,7 @@ subroutine tridiag_&
     u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
     u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
   enddo
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
   if (n_stored_vecs > 0) then
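The tridiag hunks above allocate per-thread work buffers ur_p/uc_p indexed 0:max_threads-1, let each thread accumulate into its own column without synchronization, and afterwards sum the columns serially into u_row/u_col. A minimal self-contained sketch of that pattern (hypothetical names, not ELPA code; build with -fopenmp):

    program per_thread_buffers
      use omp_lib
      implicit none
      integer, parameter :: n = 1000
      integer :: i, t, max_threads
      real(kind=8) :: u(n)
      real(kind=8), allocatable :: u_p(:,:)

      max_threads = omp_get_max_threads()
      allocate(u_p(n, 0:max_threads-1))
      u_p = 0.0d0
      u   = 0.0d0

      !$omp parallel private(i, t)
      t = omp_get_thread_num()
      !$omp do
      do i = 1, n
        ! each thread writes only its own column: no locks needed
        u_p(i, t) = u_p(i, t) + real(i, kind=8)
      enddo
      !$omp end do
      !$omp end parallel

      ! serial reduction over the per-thread columns, as done for uc_p/ur_p
      do t = 0, max_threads - 1
        u(:) = u(:) + u_p(:, t)
      enddo
      print *, 'total = ', sum(u)   ! 500500.0 for n = 1000
    end program per_thread_buffers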
src/elpa1/elpa_cholesky_template.F90

@@ -82,7 +82,7 @@
      &PRECISION&
      &")
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   omp_threads_caller = omp_get_max_threads()

@@ -330,7 +330,7 @@
   enddo
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
src/elpa1/elpa_reduce_add_vectors.F90

@@ -75,7 +75,7 @@ subroutine elpa_reduce_add_vectors_&
 !-------------------------------------------------------------------------------
   use precision
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   use omp_lib
 #endif
   use elpa_mpi

@@ -132,7 +132,7 @@ subroutine elpa_reduce_add_vectors_&
   check_allocate("elpa_reduce_add: aux2", istat, errorMessage)
   aux1(:) = 0
   aux2(:) = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   !call omp_set_num_threads(nrThreads)
 !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)

@@ -147,7 +147,7 @@ subroutine elpa_reduce_add_vectors_&
   if (myps == ips) then
 !   k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
     do lc = 1, nvc

@@ -161,7 +161,7 @@ subroutine elpa_reduce_add_vectors_&
     enddo
     k = nvc*auxstride
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif

@@ -184,13 +184,13 @@ subroutine elpa_reduce_add_vectors_&
   if (k > 0) aux2 = aux1
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier
 #endif
   if (mypt == ipt) then
 !   k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
     do lc = 1, nvc

@@ -207,7 +207,7 @@ subroutine elpa_reduce_add_vectors_&
     endif
   enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
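The barrier/master bracket in this file is a recurring shape: every thread fills a shared buffer, exactly one thread runs the collective step (here an MPI reduction), and barriers fence the hand-off. A runnable sketch of just the OpenMP skeleton, with a plain copy standing in for the MPI call (illustrative only, not ELPA code):

    program master_section_pattern
      use omp_lib
      implicit none
      integer, parameter :: n = 8
      real(kind=8) :: aux1(n), aux2(n)
      integer :: i

      !$omp parallel private(i)
      !$omp do
      do i = 1, n
        aux1(i) = real(i, kind=8)   ! all threads fill their share of aux1
      enddo
      !$omp end do
      !$omp barrier
      !$omp master
      aux2 = aux1                   ! stand-in for the MPI reduction step
      !$omp end master
      !$omp barrier
      !$omp do
      do i = 1, n
        aux2(i) = 2.0d0 * aux2(i)   ! all threads consume the reduced data
      enddo
      !$omp end do
      !$omp end parallel
      print *, aux2
    end program master_section_pattern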
src/elpa1/elpa_solve_tridi_impl_public.F90

@@ -92,7 +92,7 @@
   matrixRows = obj%local_nrows
   matrixCols = obj%local_ncols
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   omp_threads_caller = omp_get_max_threads()

@@ -135,7 +135,7 @@
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
src/elpa1/elpa_transpose_vectors.F90

@@ -87,7 +87,7 @@ subroutine ROUTINE_NAME&
 !-------------------------------------------------------------------------------
   use precision
   use elpa_abstract_impl
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   use omp_lib
 #endif
   use elpa_mpi

@@ -147,7 +147,7 @@ subroutine ROUTINE_NAME&
   allocate(aux(((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t)*nblk*nvc), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
 #endif
   do n = 0, lcm_s_t-1

@@ -163,7 +163,7 @@ subroutine ROUTINE_NAME&
   if (nblks_comm .ne. 0) then
     if (myps == ips) then
 !     k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
       do lc = 1, nvc

@@ -177,7 +177,7 @@ subroutine ROUTINE_NAME&
       enddo
     endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif

@@ -198,7 +198,7 @@ subroutine ROUTINE_NAME&
   call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier

@@ -222,7 +222,7 @@ subroutine ROUTINE_NAME&
     endif
   enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
   deallocate(aux, stat=istat, errmsg=errorMessage)
src/elpa1/elpa_transpose_vectors_ss.F90

@@ -78,7 +78,7 @@ subroutine elpa_transpose_vectors_ss_&
 !-------------------------------------------------------------------------------
   use precision
   use elpa_abstract_impl
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   use omp_lib
 #endif
   use elpa_mpi

@@ -130,7 +130,7 @@ subroutine elpa_transpose_vectors_ss_&
   allocate(aux(((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t)*nblk*nvc))
   check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
 #endif
   do n = 0, lcm_s_t-1

@@ -146,7 +146,7 @@ subroutine elpa_transpose_vectors_ss_&
   if (nblks_comm .ne. 0) then
     if (myps == ips) then
 !     k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
       do lc = 1, nvc

@@ -160,7 +160,7 @@ subroutine elpa_transpose_vectors_ss_&
       enddo
     endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif

@@ -181,7 +181,7 @@ subroutine elpa_transpose_vectors_ss_&
   call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier

@@ -201,7 +201,7 @@ subroutine elpa_transpose_vectors_ss_&
     endif
   enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
   deallocate(aux, stat=istat, errmsg=errorMessage)
src/elpa2/compute_hh_trafo.F90

(diff collapsed in the web view; not shown here)
src/elpa2/elpa2_bandred_template.F90

@@ -103,7 +103,7 @@
   use cuda_functions
   use iso_c_binding
   use elpa1_compute
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   use omp_lib
 #endif
   use precision

@@ -140,7 +140,7 @@
 #if REALCASE == 1
   integer(kind=ik) :: vmrCols
 #endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   integer(kind=ik) :: mynlc, lrs, transformChunkSize
 #endif
   integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow

@@ -628,7 +628,7 @@
   aux1 = 0.0_rck
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 #if 0
   ! original complex implementation without openmp. check performance
   nlc = 0 ! number of local columns

@@ -750,7 +750,7 @@
   enddo
 !$omp end parallel
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
   nlc = 0 ! number of local columns
   do j = 1, lc-1

@@ -785,7 +785,7 @@
 #endif
     endif
   enddo
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   enddo ! lc
   if (useGPU_reduction_lower_block_to_tridiagonal) then

@@ -939,7 +939,7 @@
   ! n_way is actually a branch for the number of OpenMP threads
   n_way = 1
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 #if REALCASE == 1
   n_way = max_threads

@@ -1022,7 +1022,7 @@
     endif ! l_cols>0 .and. l_rows>0
   else ! n_way > 1
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   if (.not. useGPU) then
     umcCPU(1:l_cols,1:n_cols) = 0.0_rck

@@ -1137,7 +1137,7 @@
     endif ! useGPU
   endif ! l_cols>0 .and. l_rows>0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   endif ! n_way > 1
 #if REALCASE == 1
 !$omp end parallel

@@ -1394,7 +1394,7 @@
   ! A = A - V*U**T - U*V**T
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
   n_threads = omp_get_num_threads()

@@ -1433,7 +1433,7 @@
   enddo
 !$omp end parallel
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
   do i = 0,(istep*nbw-1)/tile_size
     lcs = i*l_cols_tile+1

@@ -1464,7 +1464,7 @@
       call obj%timer%stop("blas")
     endif ! useGPU
   enddo ! i=0,(istep*nbw-1)/tile_size
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
   if (.not.(useGPU)) then
     if (allocated(vr)) then
src/elpa2/elpa2_print_kernels.F90

@@ -88,7 +88,7 @@ program print_available_elpa2_kernels
   print *, "information if (and how) the kernels can be choosen at "
   print *, "runtime"
   print *
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   print *, " ELPA supports threads: yes"
 #else
   print *, " ELPA supports threads: no"
src/elpa2/elpa2_template.F90

@@ -223,7 +223,7 @@
   reDistributeMatrix = .false.
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   omp_threads_caller = omp_get_max_threads()

@@ -305,7 +305,7 @@
   endif
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)

@@ -972,7 +972,7 @@
   endif
   ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90

@@ -94,7 +94,7 @@
   use cuda_functions
   use precision
   use iso_c_binding
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   ! use omp_lib
 #endif
   implicit none

@@ -120,7 +120,7 @@
   integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
   integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
   integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   integer(kind=ik) :: thread_width, csw, b_off, b_len
 #endif
   integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd

@@ -129,7 +129,7 @@
   integer(kind=MPI_KIND) :: mpierr
   logical :: flag
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:)
 #else
   MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:)

@@ -141,7 +141,7 @@
   MATH_DATATYPE(kind=rck), allocatable :: row(:)
   MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
   MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
   MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)

@@ -184,7 +184,7 @@
   integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
   integer(kind=ik) :: my_thread
 #endif

@@ -266,7 +266,7 @@
   l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
   if (l_nev == 0) then
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
     thread_width = 0
 #endif
     stripe_width = 0

@@ -275,7 +275,7 @@
   else ! l_nev
-#if WITH_OPENMP
+#if WITH_OPENMP_TRADITIONAL
   ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
   ! every primary cache
   ! Suggested stripe width is 48 - should this be reduced for the complex case ???

@@ -382,7 +382,7 @@
   endif ! useGPU
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
   ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
   ! every primary cache

@@ -481,7 +481,7 @@
   last_stripe_width = l_nev - (stripe_count-1)*stripe_width
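A quick check of the stripe-width comment in the hunks above: 48 stripes of 64 real*8 values occupy 48 * 64 * 8 B = 24576 B = 24 KiB, which fits in a typical 32 KiB L1 data cache (the 32 KiB figure is an assumption; the source says only "primary cache").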