elpa / elpa / Commits / 5b55ad7a

Commit 5b55ad7a, authored Feb 24, 2021 by Andreas Marek
Merge branch 'GPU_in_openmp_code_path' into 'master_pre_stage'

GPU in openmp code path. See merge request !56

Parents: 4ff2f9e6, ea6daa06
Changes: 14
Pipelines: 1
src/GPU/cuUtils.cu

@@ -57,7 +57,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_REAL
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #undef DOUBLE_PRECISION_REAL
 #include "cuUtils_template.cu"

@@ -71,7 +71,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_COMPLEX
-#if WANT_SINGLE_PRECISION_COMPLEX
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
 #undef DOUBLE_PRECISION_COMPLEX
 #include "cuUtils_template.cu"
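Several hunks in this commit, both in the CUDA sources and in the .F90 templates below, replace #if FLAG with #ifdef FLAG for the single-precision feature flags. The distinction: #if evaluates the macro as an integer expression (an undefined name silently counts as 0, and a macro defined to nothing is a preprocessing error), whereas #ifdef only asks whether the name is defined at all, regardless of its value. A minimal preprocessed-Fortran sketch of that difference, using an illustrative flag name rather than ELPA's actual configure symbols:

! ifdef_sketch.F90 -- compile with a preprocessing Fortran compiler,
! e.g. "gfortran -cpp ifdef_sketch.F90"
#define MY_FLAG
program ifdef_sketch
  implicit none
#ifdef MY_FLAG
  ! taken: MY_FLAG is defined, even though it carries no value
  print *, 'ifdef branch active'
#endif
! "#if MY_FLAG" would fail at this point: MY_FLAG expands to nothing,
! so no integer expression is left for the preprocessor to evaluate.
end program ifdef_sketch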
src/elpa1/elpa1_auxiliary.F90

@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"

@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"
src/elpa1/elpa1_template.F90

@@ -279,8 +279,6 @@ function elpa_solve_evp_&
   ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-  ! store the number of OpenMP threads used in the calling function
-  ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
 #endif
   call obj%timer%stop("elpa_solve_evp_&

@@ -356,6 +354,18 @@ function elpa_solve_evp_&
       success = .false.
       return
     endif
+#ifdef WITH_OPENMP_TRADITIONAL
+    ! check the number of threads that ELPA should use internally
+    ! in the GPU case at the moment only _1_ thread internally is allowed
+    call obj%get("omp_threads", nrThreads, error)
+    if (nrThreads .ne. 1) then
+      print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+      print *,"setting 1 openmp thread now"
+      call obj%set("omp_threads", 1, error)
+      nrThreads = 1
+      call omp_set_num_threads(nrThreads)
+    endif
+#endif
     call obj%timer%stop("check_for_gpu")
   endif

@@ -589,8 +599,6 @@ function elpa_solve_evp_&
 #endif
   ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-  ! store the number of OpenMP threads used in the calling function
-  ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
 #endif
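The hunks above drop the stale "store the number of OpenMP threads" comments next to the restore call and, when the GPU path is active, force ELPA-internal OpenMP down to a single thread, with the caller's setting restored on exit. A standalone sketch of that save / override / restore pattern with plain OpenMP calls (no ELPA object; names chosen for illustration):

program omp_gpu_path_sketch
  use omp_lib
  implicit none
  integer :: omp_threads_caller, nrThreads

  ! remember what the calling code had configured
  omp_threads_caller = omp_get_max_threads()

  ! GPU code path: only one internal thread is allowed at the moment
  nrThreads = 1
  call omp_set_num_threads(nrThreads)
  print *, 'GPU section running with', omp_get_max_threads(), 'thread(s)'

  ! restore the original OpenMP settings before returning to the caller
  call omp_set_num_threads(omp_threads_caller)
  print *, 'restored to', omp_get_max_threads(), 'thread(s)'
end program omp_gpu_path_sketch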
src/elpa1/elpa1_tridiag_template.F90

@@ -612,26 +612,28 @@ subroutine tridiag_&
             if (l_row_end < l_row_beg) cycle
 #ifdef WITH_OPENMP_TRADITIONAL
             if (mod(n_iter,n_threads) == my_thread) then
-              if (wantDebug) call obj%timer%start("blas")
-              call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
-                                  int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                  ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                  v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
-              if (i/=j) then
-                if (isSkewsymmetric) then
-                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                      -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
-                else
-                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                      ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
-                endif
-              endif
+              if (.not. useGPU) then
+                if (wantDebug) call obj%timer%start("blas")
+                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
+                                    int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                    ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                    v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
+                if (i/=j) then
+                  if (isSkewsymmetric) then
+                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                        -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                  else
+                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                        int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                        ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                        ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                  endif
+                endif
+              endif
               if (wantDebug) call obj%timer%stop("blas")
@@ -750,11 +752,12 @@ subroutine tridiag_&
 #ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
           call obj%timer%stop("OpenMP parallel")

-          do i=0,max_threads-1
-            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
-            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
-          enddo
+          if (.not.(useGPU)) then
+            do i=0,max_threads-1
+              u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
+              u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
+            enddo
+          endif
 #endif /* WITH_OPENMP_TRADITIONAL */

           ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
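In the CPU path each OpenMP thread accumulates its contribution into its own column of uc_p/ur_p, and the partial results are summed serially after the parallel region; the new guard skips that host-side reduction when the GPU path is used instead. A self-contained sketch of the per-thread-column accumulation idiom (array names mirror the diff, but the program itself is only illustrative):

program thread_partial_sums
  use omp_lib
  implicit none
  integer, parameter :: n = 8
  integer :: i, t, max_threads
  real, allocatable :: u_col(:), uc_p(:,:)

  max_threads = omp_get_max_threads()
  allocate(u_col(n), uc_p(n, 0:max_threads-1))
  u_col = 0.0
  uc_p  = 0.0

  ! inside the parallel region each thread writes only its own column,
  ! so no synchronization is needed while accumulating
  !$omp parallel private(t)
  t = omp_get_thread_num()
  uc_p(:, t) = uc_p(:, t) + real(t + 1)
  !$omp end parallel

  ! serial reduction of the per-thread partials, as the CPU path does
  ! with uc_p/ur_p after "!$OMP END PARALLEL"
  do i = 0, max_threads - 1
    u_col(1:n) = u_col(1:n) + uc_p(1:n, i)
  enddo
  print *, u_col
end program thread_partial_sums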
src/elpa1/elpa_reduce_add_vectors.F90

@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
   aux2(:) = 0
 #ifdef WITH_OPENMP_TRADITIONAL
-  !call omp_set_num_threads(nrThreads)
-  !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
+  !$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
+  !$omp& comm_t, nblks_tot, aux2, aux1, nvr, nvc)
 #endif
   do n = 0, lcm_s_t-1
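The rewritten directive here, and the analogous ones in the transpose routines below, switch to default(none): every variable touched inside the parallel region must then be listed explicitly as private or shared, so an accidentally shared temporary becomes a compile-time error instead of a silent data race. A minimal sketch of the same idiom (not ELPA code):

program default_none_sketch
  use omp_lib
  implicit none
  integer, parameter :: n = 4
  integer :: i, tid
  integer :: counts(n)

  counts = 0
  ! default(none): every variable referenced in the region must appear in a
  ! clause, so a forgotten private/shared declaration is rejected by the compiler
  !$omp parallel do &
  !$omp default(none) &
  !$omp private(i, tid) &
  !$omp shared(counts)
  do i = 1, n
    tid = omp_get_thread_num()
    counts(i) = tid
  enddo
  !$omp end parallel do
  print *, counts
end program default_none_sketch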
src/elpa1/elpa_transpose_vectors.F90

@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
   allocate(aux(((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t)*nblk*nvc), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
src/elpa1/elpa_transpose_vectors_ss.F90

@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
   allocate(aux(((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t)*nblk*nvc))
   check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
src/elpa2/compute_hh_trafo.F90

@@ -58,11 +58,10 @@ l_nev, &
                           a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
                           hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
 #ifdef WITH_OPENMP_TRADITIONAL
-                          my_thread, thread_width, &
+                          my_thread, thread_width, kernel, last_stripe_width)
 #else
-                          last_stripe_width, &
+                          last_stripe_width, kernel)
 #endif
-                          kernel)

   use precision
   use elpa_abstract_impl

@@ -141,6 +140,7 @@ kernel)
 #else /* WITH_OPENMP_TRADITIONAL */
   integer(kind=ik), intent(in) :: l_nev, thread_width
+  integer(kind=ik), intent(in), optional :: last_stripe_width
 #if REALCASE == 1
   ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
   real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
@@ -221,54 +221,39 @@ kernel)
 #ifdef WITH_OPENMP_TRADITIONAL
   if (my_thread==1) then
     ! in the calling routine threads go form 1 .. max_threads
 #endif
     ttt = mpi_wtime()
 #ifdef WITH_OPENMP_TRADITIONAL
   endif
 #endif

-#ifdef WITH_OPENMP_TRADITIONAL
-#if REALCASE == 1
-  if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
-    print *,"compute_hh_trafo_&
-            &MATH_DATATYPE&
-            &_GPU OPENMP: not yet implemented"
-    stop 1
-  endif
-#endif
-#if COMPLEXCASE == 1
-  if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
-    print *,"compute_hh_trafo_&
-            &MATH_DATATYPE&
-            &_GPU OPENMP: not yet implemented"
-    stop 1
-  endif
-#endif
-#endif /* WITH_OPENMP_TRADITIONAL */

 #ifndef WITH_OPENMP_TRADITIONAL
   nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
 #else /* WITH_OPENMP_TRADITIONAL */
-  if (istripe<stripe_count) then
-    nl = stripe_width
+  if (present(last_stripe_width)) then
+    nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
   else
-    noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
-    nl = min(my_thread*thread_width-noff, l_nev-noff)
-    if (nl<=0) then
-      if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
-      &MATH_DATATYPE&
+    if (istripe<stripe_count) then
+      nl = stripe_width
+    else
+      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
+      nl = min(my_thread*thread_width-noff, l_nev-noff)
+      if (nl<=0) then
+        if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
+        &MATH_DATATYPE&
 #ifdef WITH_OPENMP_TRADITIONAL
-      &_openmp" // &
+        &_openmp" // &
 #else
-      &" // &
+        &" // &
 #endif
-      &PRECISION_SUFFIX&
-      )
-      return
+        &PRECISION_SUFFIX&
+        )
+        return
+      endif
     endif
   endif
 #endif /* not WITH_OPENMP_TRADITIONAL */
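With last_stripe_width now an optional dummy argument, the OpenMP branch picks the stripe length via present() and merge() before falling back to the old per-thread computation. A small standalone illustration of those two intrinsics (not ELPA code; the function and argument names are invented for the example):

program present_merge_sketch
  implicit none
  print *, stripe_len(64, 5, 8)       ! interior stripe: no optional argument given
  print *, stripe_len(64, 8, 8, 17)   ! last stripe: optional width is honoured
contains
  integer function stripe_len(stripe_width, istripe, stripe_count, last_stripe_width)
    integer, intent(in)           :: stripe_width, istripe, stripe_count
    integer, intent(in), optional :: last_stripe_width

    if (present(last_stripe_width)) then
      ! merge(a, b, mask): a if mask is true, otherwise b
      stripe_len = merge(stripe_width, last_stripe_width, istripe < stripe_count)
    else
      stripe_len = stripe_width
    endif
  end function stripe_len
end program present_merge_sketch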
src/elpa2/elpa2_bandred_template.F90

@@ -629,65 +629,65 @@ max_threads)
       aux1 = 0.0_rck
 #ifdef WITH_OPENMP_TRADITIONAL
-#if 0
-      ! original complex implementation without openmp. check performance
-      nlc = 0 ! number of local columns
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
-        endif
-      enddo
-
-      ! Get global dot products
-#ifdef WITH_MPI
-      if (wantDebug) call obj%timer%start("mpi_communication")
-      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
-                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-
-      ! Transform
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
-        endif
-      enddo
-
-      if (wantDebug) call obj%timer%stop("mpi_communication")
-
-#else /* WITH_MPI */
-
-      ! Transform
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
-        endif
-      enddo
-
-#endif /* WITH_MPI */
-#endif /* if 0 */
+!#if 0
+! ! original complex implementation without openmp. check performance
+! nlc = 0 ! number of local columns
+! do j=1,lc-1
+!   lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!   if (lcx>0) then
+!     nlc = nlc+1
+!     aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
+!   endif
+! enddo
+!
+! ! Transform
+! ! Get global dot products
+!#ifdef WITH_MPI
+! if (wantDebug) call obj%timer%start("mpi_communication")
+! if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
+!                               int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+!
+! nlc = 0
+! do j=1,lc-1
+!   lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!   if (lcx>0) then
+!     nlc = nlc+1
+!     a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!   endif
+! enddo
+!
+! if (wantDebug) call obj%timer%stop("mpi_communication")
+!
+!#else /* WITH_MPI */
+!
+! ! Transform
+!
+! nlc = 0
+! do j=1,lc-1
+!   lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!   if (lcx>0) then
+!     nlc = nlc+1
+!     a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+!   endif
+! enddo
+!
+!#endif /* WITH_MPI */
+!!
+!! ! Transform
+!!
+!! nlc = 0
+!! do j=1,lc-1
+!!   lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!!   if (lcx>0) then
+!!     nlc = nlc+1
+!!     a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!!   endif
+!! enddo
+!#endif /* if 0 */

      !Open up one omp region to avoid paying openmp overhead.
      !This does not help performance due to the addition of two openmp barriers around the MPI call,
@@ -909,35 +909,35 @@ max_threads)
       ! of the tiles, so we can use strips of the matrix
-#if 0
-      ! original complex implemetation check for performance
-      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
-      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
-
-      if (l_cols>0 .and. l_rows>0) then
-        do i=0,(istep*nbw-1)/tile_size
-
-          lcs = i*l_cols_tile+1
-          lce = min(l_cols,(i+1)*l_cols_tile)
-          if (lce<lcs) cycle
-
-          lre = min(l_rows,(i+1)*l_rows_tile)
-
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
-                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
-          call obj%timer%stop("blas")
-
-          if (i==0) cycle
-          lre = min(l_rows,i*l_rows_tile)
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
-                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
-          call obj%timer%stop("blas")
-        enddo
-      endif ! (l_cols>0 .and. l_rows>0)
-#endif /* if 0 */
+!#if 0
+!      ! original complex implemetation check for performance
+!      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
+!      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
+!
+!      if (l_cols>0 .and. l_rows>0) then
+!        do i=0,(istep*nbw-1)/tile_size
+!
+!          lcs = i*l_cols_tile+1
+!          lce = min(l_cols,(i+1)*l_cols_tile)
+!          if (lce<lcs) cycle
+!
+!          lre = min(l_rows,(i+1)*l_rows_tile)
+!
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
+!                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
+!          call obj%timer%stop("blas")
+!
+!          if (i==0) cycle
+!          lre = min(l_rows,i*l_rows_tile)
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
+!                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
+!          call obj%timer%stop("blas")
+!        enddo
+!      endif ! (l_cols>0 .and. l_rows>0)
+!#endif /* if 0 */

       !Code for Algorithm 4
@@ -1396,7 +1396,11 @@ max_threads)
       ! A = A - V*U**T - U*V**T
 #ifdef WITH_OPENMP_TRADITIONAL
-      !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
+      !$omp parallel &
+      !$omp default(none) &
+      !$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
+      !$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
+      !$omp& umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
       n_threads = omp_get_num_threads()
       if (mod(n_threads, 2) == 0) then
@@ -1424,13 +1428,30 @@ max_threads)
             myend  = mystart + work_per_thread - 1
             if (myend > lre) myend = lre

             if (myend-mystart+1 < 1) cycle
-            call obj%timer%start("blas")
-            call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
-                                int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
-                                vmrCPU(mystart,1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
-                                umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
-                                ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
-            call obj%timer%stop("blas")
+            if (useGPU) then
+              if (n_way .gt. 1) then
+                print *,"error more than 1 openmp thread used in GPU part of elpa2_bandred"
+                print *,"this should never happen"
+                stop
+              endif
+              call obj%timer%start("cublas")
+              call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
+                                         lce-lcs+1, 2*n_cols, -ONE, &
+                                         vmr_dev, cur_l_rows, (umc_dev + (lcs-1)* &
+                                         size_of_datatype), &
+                                         cur_l_cols, ONE, (a_dev + (lcs-1)*lda* &
+                                         size_of_datatype), lda)
+              call obj%timer%stop("cublas")
+            else
+              call obj%timer%start("blas")
+              call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
+                                  int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
+                                  vmrCPU(mystart,1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
+                                  umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
+                                  ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
+              call obj%timer%stop("blas")
+            endif
           enddo
 !$omp end parallel
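In the GPU branch of this rank-2 update, the device matrices are addressed by raw byte offsets: column lcs of a column-major matrix with leading dimension lda starts (lcs-1)*lda*size_of_datatype bytes past the base pointer, which is the address handed to the cublas_PRECISION_GEMM wrapper. A small host-side sketch of that offset arithmetic (illustrative values, not taken from ELPA):

program device_offset_sketch
  use iso_c_binding, only: c_double
  implicit none
  integer :: lda, lcs, size_of_datatype, offset_bytes

  lda = 1024                                         ! leading dimension of the device matrix
  lcs = 17                                           ! first column of the current tile (1-based)
  size_of_datatype = storage_size(1.0_c_double) / 8  ! bytes per element, 8 for double precision

  ! a_dev + offset_bytes addresses column lcs of a column-major matrix,
  ! which is the pointer the GPU GEMM call receives
  offset_bytes = (lcs - 1) * lda * size_of_datatype
  print *, 'column', lcs, 'starts at byte offset', offset_bytes
end program device_offset_sketch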
src/elpa2/elpa2_template.F90

@@ -391,6 +391,18 @@
         success = .false.
         return
       endif
+#ifdef WITH_OPENMP_TRADITIONAL
+      ! check the number of threads that ELPA should use internally
+      ! in the GPU case at the moment only _1_ thread internally is allowed
+      call obj%get("omp_threads", nrThreads, error)
+      if (nrThreads .ne. 1) then
+        print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+        print *,"setting 1 openmp thread now"
+        call obj%set("omp_threads", 1, error)
+        nrThreads = 1
+        call omp_set_num_threads(nrThreads)
+      endif
+#endif
       call obj%timer%stop("check_for_gpu")
     endif

@@ -518,7 +530,7 @@
       endif
 #endif
 #endif
-#endif
+#endif /* REALCASE == 1 */

     ! consistency check: is user set kernel still identical with "kernel" or did
     ! we change it above? This is a mess and should be cleaned up

@@ -614,9 +626,7 @@
       else
         useQR = .false.
       endif
-#endif
+#endif /* REALCASE == 1 */

-#if REALCASE == 1
       useQRActual = .false.

@@ -1009,8 +1019,6 @@
     ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-    ! store the number of OpenMP threads used in the calling function
-    ! restore this at the end of ELPA 2
     call omp_set_num_threads(omp_threads_caller)
 #endif
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 (diff collapsed in the original view; changes not shown here)
src/elpa2/elpa2_tridiag_band_template.F90

@@ -528,8 +528,13 @@ subroutine tridiag_band_&
      ! with MPI calls

      call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

-     !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
-     !$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
+     !$omp parallel do &
+     !$omp default(none) &
+     !$omp private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
+     !$omp&        nc, nr, hs, hd, vnorm2, hf, x, h, i) &
+     !$omp shared(max_threads, obj, ab, isSkewsymmetric, wantDebug, hh_gath, &
+     !$omp        hh_cnt, tau_t, hv_t, na, istep, n_off, na_s, nb, omp_block_limits, iter) &
+     !$omp schedule(static,1), num_threads(max_threads)
      do my_thread = 1, max_threads
        if (iter == 1) then

@@ -1087,7 +1092,7 @@ subroutine tridiag_band_&
        endif
 #endif
-#if WITH_OPENMP_TRADITIONAL
+#ifdef WITH_OPENMP_TRADITIONAL
      do iblk = 1, nblocks
        if (hh_dst(iblk) >= np_rows) exit
src/elpa2/qr/qr_utils.F90

@@ -57,7 +57,7 @@ module qr_utils_mod
   public :: reverse_matrix_1dcomm_double
   public :: reverse_matrix_2dcomm_ref_double
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
   public :: reverse_vector_local_single
   public :: reverse_matrix_local_single