elpa / elpa

Commit 5b55ad7a authored Feb 24, 2021 by Andreas Marek

Merge branch 'GPU_in_openmp_code_path' into 'master_pre_stage'

Gpu in openmp code path

See merge request !56

Parents: 4ff2f9e6 ea6daa06
Changes: 14
Pipelines: 1

src/GPU/cuUtils.cu
@@ -57,7 +57,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_REAL
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #undef DOUBLE_PRECISION_REAL
 #include "cuUtils_template.cu"

@@ -71,7 +71,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_COMPLEX
-#if WANT_SINGLE_PRECISION_COMPLEX
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
 #undef DOUBLE_PRECISION_COMPLEX
 #include "cuUtils_template.cu"

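The hunks above replace "#if" tests on the single-precision feature macros with "#ifdef". A minimal sketch (not ELPA code) of the difference: "#if MACRO" needs the macro to expand to an integer expression, while "#ifdef MACRO" only asks whether the macro is defined at all, so it also works when the macro is defined without a value.

! guard_demo.F90 -- build e.g. with: gfortran -cpp -DWANT_SINGLE_PRECISION_REAL guard_demo.F90
program guard_demo
  implicit none
#ifdef WANT_SINGLE_PRECISION_REAL
  print *, 'single precision real path compiled in'
#else
  print *, 'single precision real path not compiled in'
#endif
end program guard_demo
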
src/elpa1/elpa1_auxiliary.F90
@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"

@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"

src/elpa1/elpa1_template.F90
@@ -279,8 +279,6 @@ function elpa_solve_evp_&
      ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-     ! store the number of OpenMP threads used in the calling function
-     ! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
 #endif
      call obj%timer%stop("elpa_solve_evp_&

@@ -356,6 +354,18 @@ function elpa_solve_evp_&
          success = .false.
          return
        endif
+#ifdef WITH_OPENMP_TRADITIONAL
+       ! check the number of threads that ELPA should use internally
+       ! in the GPU case at the moment only _1_ thread internally is allowed
+       call obj%get("omp_threads", nrThreads, error)
+       if (nrThreads .ne. 1) then
+         print *, "Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+         print *, "setting 1 openmp thread now"
+         call obj%set("omp_threads", 1, error)
+         nrThreads = 1
+         call omp_set_num_threads(nrThreads)
+       endif
+#endif
        call obj%timer%stop("check_for_gpu")
      endif

@@ -589,8 +599,6 @@ function elpa_solve_evp_&
 #endif
      ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-     ! store the number of OpenMP threads used in the calling function
-     ! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
 #endif

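The second hunk above forces a single OpenMP thread inside ELPA whenever the GPU code path is taken, and the surrounding hunks restore the caller's thread count afterwards. A minimal standalone sketch of that pattern (generic OpenMP, not the ELPA object API): remember the caller's thread count, switch to one thread for the GPU section, and restore the original setting at the end.

program force_single_thread
  use omp_lib
  implicit none
  integer :: omp_threads_caller, nrThreads

  ! number of OpenMP threads the calling code was using
  omp_threads_caller = omp_get_max_threads()

  nrThreads = omp_threads_caller
  if (nrThreads /= 1) then
    print *, "GPU code path: forcing 1 OpenMP thread internally"
    nrThreads = 1
    call omp_set_num_threads(nrThreads)
  endif

  ! ... GPU work would run here with a single host thread ...

  ! restore the original OpenMP settings for the caller
  call omp_set_num_threads(omp_threads_caller)
end program force_single_thread
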
src/elpa1/elpa1_tridiag_template.F90
@@ -612,26 +612,28 @@ subroutine tridiag_&
             if (l_row_end < l_row_beg) cycle
 #ifdef WITH_OPENMP_TRADITIONAL
             if (mod(n_iter,n_threads) == my_thread) then
               if (wantDebug) call obj%timer%start("blas")
+              if (.not. useGPU) then
                 call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                     int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                     ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                     v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
                 if (i/=j) then
                   if (isSkewsymmetric) then
                     call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                         int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                         -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                         v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                         ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                   else
                     call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
                                         int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                         ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
                                         v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                         ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                   endif
                 endif
+              endif
               if (wantDebug) call obj%timer%stop("blas")

@@ -750,11 +752,12 @@ subroutine tridiag_&
 #ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
           call obj%timer%stop("OpenMP parallel")
+          if (.not.(useGPU)) then
             do i=0,max_threads-1
               u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
               u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
             enddo
+          endif
 #endif /* WITH_OPENMP_TRADITIONAL */

           ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v

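In the hunks above the host-side reduction of the per-thread buffers uc_p and ur_p is now skipped when the GPU path is active. A minimal sketch of the underlying pattern (hypothetical array names, not the ELPA routine): each OpenMP thread accumulates into its own column of a work array, and the columns are summed on the host only when the GPU is not used.

program per_thread_accumulate
  use omp_lib
  implicit none
  integer, parameter :: n = 8
  logical, parameter :: useGPU = .false.
  real(kind=8), allocatable :: uc_p(:,:)
  real(kind=8) :: u_col(n)
  integer :: i, my_thread, max_threads

  max_threads = omp_get_max_threads()
  allocate(uc_p(n, 0:max_threads-1))
  uc_p  = 0.0d0
  u_col = 0.0d0

  !$omp parallel private(my_thread)
  my_thread = omp_get_thread_num()
  ! each thread writes only to its own column of the work array
  uc_p(:, my_thread) = uc_p(:, my_thread) + 1.0d0
  !$omp end parallel

  if (.not. useGPU) then
    ! host-side reduction over the per-thread buffers
    do i = 0, max_threads-1
      u_col(1:n) = u_col(1:n) + uc_p(1:n, i)
    enddo
  endif

  print *, u_col
end program per_thread_accumulate
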
src/elpa1/elpa_reduce_add_vectors.F90
@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
   aux2(:) = 0
 #ifdef WITH_OPENMP_TRADITIONAL
   !call omp_set_num_threads(nrThreads)
-  !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
+  !$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
+  !$omp& comm_t, nblks_tot, aux2, aux1, nvr, nvc)
 #endif
   do n = 0, lcm_s_t-1

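This change, like the ones in the two transpose routines below, splits the parallel directive over continuation lines and adds default(none) with an explicit shared list. A minimal sketch (not the ELPA routine) of why that style is used: with default(none), every variable referenced inside the parallel region must appear in a private or shared clause, so a forgotten clause becomes a compile-time error instead of a silent data race.

program default_none_demo
  use omp_lib
  implicit none
  integer, parameter :: n = 16
  integer :: i
  real(kind=8) :: aux(n)

  aux(:) = 0.0d0

  !$omp parallel do &
  !$omp default(none) &
  !$omp private(i) &
  !$omp& shared(aux)
  do i = 1, n
    aux(i) = real(i, kind=8)
  enddo
  !$omp end parallel do

  print *, sum(aux)
end program default_none_demo
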
src/elpa1/elpa_transpose_vectors.F90
@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1

src/elpa1/elpa_transpose_vectors_ss.F90
@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1

src/elpa2/compute_hh_trafo.F90
@@ -58,11 +58,10 @@ l_nev, &
                              a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
                              hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
 #ifdef WITH_OPENMP_TRADITIONAL
-                             my_thread, thread_width, &
+                             my_thread, thread_width, kernel, last_stripe_width)
 #else
-                             last_stripe_width, &
+                             last_stripe_width, kernel)
 #endif
-                             kernel)
   use precision
   use elpa_abstract_impl

@@ -141,6 +140,7 @@ kernel)
 #else /* WITH_OPENMP_TRADITIONAL */
   integer(kind=ik), intent(in)           :: l_nev, thread_width
+  integer(kind=ik), intent(in), optional :: last_stripe_width
 #if REALCASE == 1
   ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
   real(kind=C_DATATYPE_KIND), pointer    :: a(:,:,:,:)

@@ -221,54 +221,39 @@ kernel)
 #ifdef WITH_OPENMP_TRADITIONAL
       if (my_thread==1) then
-        ! in the calling routine threads go form 1 .. max_threads
 #endif
         ttt = mpi_wtime()
 #ifdef WITH_OPENMP_TRADITIONAL
       endif
 #endif

-#ifdef WITH_OPENMP_TRADITIONAL
-#if REALCASE == 1
-      if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
-        print *,"compute_hh_trafo_&
-                &MATH_DATATYPE&
-                &_GPU OPENMP: not yet implemented"
-        stop 1
-      endif
-#endif
-#if COMPLEXCASE == 1
-      if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
-        print *,"compute_hh_trafo_&
-                &MATH_DATATYPE&
-                &_GPU OPENMP: not yet implemented"
-        stop 1
-      endif
-#endif
-#endif /* WITH_OPENMP_TRADITIONAL */

 #ifndef WITH_OPENMP_TRADITIONAL
       nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
 #else /* WITH_OPENMP_TRADITIONAL */
-      if (istripe<stripe_count) then
-        nl = stripe_width
+      if (present(last_stripe_width)) then
+        nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
       else
-        noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
-        nl = min(my_thread*thread_width-noff, l_nev-noff)
-        if (nl<=0) then
-          if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
-          &MATH_DATATYPE&
+        if (istripe<stripe_count) then
+          nl = stripe_width
+        else
+          noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
+          nl = min(my_thread*thread_width-noff, l_nev-noff)
+          if (nl<=0) then
+            if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
+            &MATH_DATATYPE&
 #ifdef WITH_OPENMP_TRADITIONAL
             &_openmp" // &
 #else
             &" // &
 #endif
             &PRECISION_SUFFIX &
             )
             return
           endif
         endif
+      endif
 #endif /* not WITH_OPENMP_TRADITIONAL */

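The signature changes above make last_stripe_width an optional dummy argument in the OpenMP variant and branch on present(last_stripe_width). A minimal sketch of that Fortran pattern (hypothetical routine, not the ELPA interface):

module stripe_demo
  implicit none
contains
  integer function stripe_length(istripe, stripe_count, stripe_width, last_stripe_width)
    integer, intent(in)           :: istripe, stripe_count, stripe_width
    integer, intent(in), optional :: last_stripe_width

    if (present(last_stripe_width)) then
      ! the last stripe may be narrower than the regular ones
      stripe_length = merge(stripe_width, last_stripe_width, istripe < stripe_count)
    else
      ! no optional argument supplied: fall back to the regular width
      stripe_length = stripe_width
    endif
  end function stripe_length
end module stripe_demo

program test_stripe
  use stripe_demo
  implicit none
  print *, stripe_length(4, 4, 32, last_stripe_width=17)   ! last stripe -> 17
  print *, stripe_length(1, 4, 32)                         ! no optional arg -> 32
end program test_stripe
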
src/elpa2/elpa2_bandred_template.F90
@@ -629,65 +629,65 @@ max_threads)
       aux1 = 0.0_rck

 #ifdef WITH_OPENMP_TRADITIONAL
-#if 0
-! original complex implementation without openmp. check performance
-      nlc = 0 ! number of local columns
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
-        endif
-      enddo
-
-      ! Get global dot products
-#ifdef WITH_MPI
-      if (wantDebug) call obj%timer%start("mpi_communication")
-      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
-                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-
-      ! Transform
-
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
-        endif
-      enddo
-
-      if (wantDebug) call obj%timer%stop("mpi_communication")
-
-#else /* WITH_MPI */
-
-      ! Transform
-
-      nlc = 0
-      do j=1,lc-1
-        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
-        if (lcx>0) then
-          nlc = nlc+1
-          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
-        endif
-      enddo
-
-#endif /* WITH_MPI */
-#endif /* if 0 */
+!#if 0
+! ! original complex implementation without openmp. check performance
+!      nlc = 0 ! number of local columns
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
+!        endif
+!      enddo
+!
+! ! Get global dot products
+!#ifdef WITH_MPI
+!      if (wantDebug) call obj%timer%start("mpi_communication")
+!      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
+!                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+!
+! ! Transform
+!
+!      nlc = 0
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!        endif
+!      enddo
+!
+!      if (wantDebug) call obj%timer%stop("mpi_communication")
+!
+!#else /* WITH_MPI */
+!
+! ! Transform
+!
+!      nlc = 0
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+!        endif
+!      enddo
+!
+!#endif /* WITH_MPI */
+!!
+!! ! Transform
+!!
+!!      nlc = 0
+!!      do j=1,lc-1
+!!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!!        if (lcx>0) then
+!!          nlc = nlc+1
+!!          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!!        endif
+!!      enddo
+!#endif /* if 0 */

 !Open up one omp region to avoid paying openmp overhead.
 !This does not help performance due to the addition of two openmp barriers around the MPI call,

@@ -909,35 +909,35 @@ max_threads)
       ! of the tiles, so we can use strips of the matrix

-#if 0
-! original complex implemetation check for performance
-      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
-      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
-
-      if (l_cols>0 .and. l_rows>0) then
-        do i=0,(istep*nbw-1)/tile_size
-
-          lcs = i*l_cols_tile+1
-          lce = min(l_cols,(i+1)*l_cols_tile)
-          if (lce<lcs) cycle
-
-          lre = min(l_rows,(i+1)*l_rows_tile)
-
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
-                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
-          call obj%timer%stop("blas")
-
-          if (i==0) cycle
-          lre = min(l_rows,i*l_rows_tile)
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
-                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
-          call obj%timer%stop("blas")
-        enddo
-
-      endif ! (l_cols>0 .and. l_rows>0)
-#endif /* if 0 */
+!#if 0
+! ! original complex implemetation check for performance
+!      umcCPU(1:l_cols,1:n_cols) = 0.0_rck
+!      vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
+!
+!      if (l_cols>0 .and. l_rows>0) then
+!        do i=0,(istep*nbw-1)/tile_size
+!
+!          lcs = i*l_cols_tile+1
+!          lce = min(l_cols,(i+1)*l_cols_tile)
+!          if (lce<lcs) cycle
+!
+!          lre = min(l_rows,(i+1)*l_rows_tile)
+!
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
+!                              vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
+!          call obj%timer%stop("blas")
+!
+!          if (i==0) cycle
+!          lre = min(l_rows,i*l_rows_tile)
+!          call obj%timer%start("blas")
+!          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
+!                              umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
+!          call obj%timer%stop("blas")
+!        enddo
+!
+!      endif ! (l_cols>0 .and. l_rows>0)
+!#endif /* if 0 */

       !Code for Algorithm 4

@@ -1396,7 +1396,11 @@ max_threads)
       ! A = A - V*U**T - U*V**T
 #ifdef WITH_OPENMP_TRADITIONAL
-      !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
+      !$omp parallel &
+      !$omp default(none) &
+      !$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
+      !$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
+      !$omp& umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
       n_threads = omp_get_num_threads()
       if (mod(n_threads, 2) == 0) then

@@ -1424,13 +1428,30 @@ max_threads)
             myend  = mystart + work_per_thread - 1
             if (myend > lre) myend = lre
             if (myend-mystart+1 < 1) cycle
-            call obj%timer%start("blas")
-            call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
-                                int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
-                                vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
-                                umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
-                                ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
-            call obj%timer%stop("blas")
+            if (useGPU) then
+              if (n_way .gt. 1) then
+                print *, "error more than 1 openmp thread used in GPU part of elpa2_bandred"
+                print *, "this should never happen"
+                stop
+              endif
+              call obj%timer%start("cublas")
+              call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
+                                         lce-lcs+1, 2*n_cols, -ONE, &
+                                         vmr_dev, cur_l_rows, (umc_dev + (lcs-1)* &
+                                         size_of_datatype), &
+                                         cur_l_cols, ONE, (a_dev + (lcs-1)*lda* &
+                                         size_of_datatype), lda)
+              call obj%timer%stop("cublas")
+            else
+              call obj%timer%start("blas")
+              call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
+                                  int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
+                                  vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
+                                  umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
+                                  ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
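
The last hunk dispatches the rank-2k update either to cuBLAS on the device or to the host BLAS, and aborts if the GPU branch is ever reached with more than one OpenMP thread. A minimal sketch of that guard (generic OpenMP, not ELPA code):

program gpu_single_thread_guard
  use omp_lib
  implicit none
  logical :: useGPU
  integer :: n_way

  useGPU = .true.   ! assume the GPU path was selected

  !$omp parallel private(n_way)
  n_way = omp_get_num_threads()
  if (useGPU) then
    if (n_way > 1) then
      print *, "error: more than 1 OpenMP thread used in the GPU part"
      print *, "this should never happen"
      stop
    endif
    ! ... single-threaded device update would go here ...
  else
    ! ... multi-threaded host BLAS update would go here ...
  endif
  !$omp end parallel
end program gpu_single_thread_guard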