Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
10
Issues
10
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
ab19611c
Commit
ab19611c
authored
May 07, 2018
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Changes to OpenMP
parent
04c4c95f
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
197 additions
and
106 deletions
+197
-106
src/elpa1/elpa1_merge_systems_real_template.F90
src/elpa1/elpa1_merge_systems_real_template.F90
+41
-41
src/elpa1/elpa1_solve_tridi_real_template.F90
src/elpa1/elpa1_solve_tridi_real_template.F90
+21
-18
src/elpa1/elpa1_template.F90
src/elpa1/elpa1_template.F90
+10
-2
src/elpa1/elpa1_tridiag_template.F90
src/elpa1/elpa1_tridiag_template.F90
+16
-9
src/elpa1/elpa_cholesky_template.F90
src/elpa1/elpa_cholesky_template.F90
+11
-1
src/elpa1/elpa_reduce_add_vectors.F90
src/elpa1/elpa_reduce_add_vectors.F90
+6
-2
src/elpa1/elpa_solve_tridi_impl_public.F90
src/elpa1/elpa_solve_tridi_impl_public.F90
+12
-1
src/elpa1/elpa_transpose_vectors.F90
src/elpa1/elpa_transpose_vectors.F90
+2
-1
src/elpa2/compute_hh_trafo.F90
src/elpa2/compute_hh_trafo.F90
+4
-3
src/elpa2/elpa2_bandred_template.F90
src/elpa2/elpa2_bandred_template.F90
+16
-12
src/elpa2/elpa2_template.F90
src/elpa2/elpa2_template.F90
+16
-5
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
+17
-11
src/elpa2/elpa2_tridiag_band_template.F90
src/elpa2/elpa2_tridiag_band_template.F90
+1
-0
src/elpa2/pack_unpack_cpu.F90
src/elpa2/pack_unpack_cpu.F90
+1
-0
src/elpa_index.c
src/elpa_index.c
+23
-0
No files found.
src/elpa1/elpa1_merge_systems_real_template.F90
View file @
ab19611c
...
...
@@ -57,16 +57,19 @@
subroutine
merge_systems_
&
&
PRECISION
&
(
obj
,
na
,
nm
,
d
,
e
,
q
,
ldq
,
nqoff
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
&
l_col
,
p_col
,
l_col_out
,
p_col_out
,
npc_0
,
npc_n
,
useGPU
,
wantDebug
,
success
)
l_col
,
p_col
,
l_col_out
,
p_col_out
,
npc_0
,
npc_n
,
useGPU
,
wantDebug
,
success
,
max_threads
)
use
cuda_functions
use
iso_c_binding
use
precision
use
elpa_abstract_impl
#ifdef WITH_OPENMP
use
omp_lib
#endif
implicit
none
#include "../general/precision_kinds.F90"
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nm
,
ldq
,
nqoff
,
nblk
,
matrixCols
,
mpi_comm_rows
,
&
mpi_comm_cols
,
npc_0
,
npc_n
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nm
,
ldq
,
nqoff
,
nblk
,
matrixCols
,
mpi_comm_rows
,
&
mpi_comm_cols
,
npc_0
,
npc_n
integer
(
kind
=
ik
),
intent
(
in
)
::
l_col
(
na
),
p_col
(
na
),
l_col_out
(
na
),
p_col_out
(
na
)
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
d
(
na
),
e
#ifdef USE_ASSUMED_SIZE
...
...
@@ -74,49 +77,46 @@
#else
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
q
(
ldq
,
matrixCols
)
#endif
logical
,
intent
(
in
)
::
useGPU
,
wantDebug
logical
,
intent
(
out
)
::
success
logical
,
intent
(
in
)
::
useGPU
,
wantDebug
logical
,
intent
(
out
)
::
success
! TODO: play with max_strip. If it was larger, matrices being multiplied
! might be larger as well!
integer
(
kind
=
ik
),
parameter
::
max_strip
=
128
real
(
kind
=
REAL_DATATYPE
)
::
PRECISION_LAMCH
,
PRECISION_LAPY2
real
(
kind
=
REAL_DATATYPE
)
::
beta
,
sig
,
s
,
c
,
t
,
tau
,
rho
,
eps
,
tol
,
&
qtrans
(
2
,
2
),
dmax
,
zmax
,
d1new
,
d2new
real
(
kind
=
REAL_DATATYPE
)
::
z
(
na
),
d1
(
na
),
d2
(
na
),
z1
(
na
),
delta
(
na
),
&
dbase
(
na
),
ddiff
(
na
),
ev_scale
(
na
),
tmp
(
na
)
real
(
kind
=
REAL_DATATYPE
)
::
d1u
(
na
),
zu
(
na
),
d1l
(
na
),
zl
(
na
)
real
(
kind
=
REAL_DATATYPE
),
allocatable
::
qtmp1
(:,:),
qtmp2
(:,:),
ev
(:,:)
integer
(
kind
=
ik
),
parameter
::
max_strip
=
128
real
(
kind
=
REAL_DATATYPE
)
::
PRECISION_LAMCH
,
PRECISION_LAPY2
real
(
kind
=
REAL_DATATYPE
)
::
beta
,
sig
,
s
,
c
,
t
,
tau
,
rho
,
eps
,
tol
,
&
qtrans
(
2
,
2
),
dmax
,
zmax
,
d1new
,
d2new
real
(
kind
=
REAL_DATATYPE
)
::
z
(
na
),
d1
(
na
),
d2
(
na
),
z1
(
na
),
delta
(
na
),
&
dbase
(
na
),
ddiff
(
na
),
ev_scale
(
na
),
tmp
(
na
)
real
(
kind
=
REAL_DATATYPE
)
::
d1u
(
na
),
zu
(
na
),
d1l
(
na
),
zl
(
na
)
real
(
kind
=
REAL_DATATYPE
),
allocatable
::
qtmp1
(:,:),
qtmp2
(:,:),
ev
(:,:)
#ifdef WITH_OPENMP
real
(
kind
=
REAL_DATATYPE
),
allocatable
::
z_p
(:,:)
real
(
kind
=
REAL_DATATYPE
),
allocatable
::
z_p
(:,:)
#endif
integer
(
kind
=
ik
)
::
i
,
j
,
na1
,
na2
,
l_rows
,
l_cols
,
l_rqs
,
l_rqe
,
&
l_rqm
,
ns
,
info
integer
(
kind
=
ik
)
::
l_rnm
,
nnzu
,
nnzl
,
ndef
,
ncnt
,
max_local_cols
,
&
l_cols_qreorg
,
np
,
l_idx
,
nqcols1
,
nqcols2
integer
(
kind
=
ik
)
::
my_proc
,
n_procs
,
my_prow
,
my_pcol
,
np_rows
,
&
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
np_next
,
np_prev
,
np_rem
integer
(
kind
=
ik
)
::
idx
(
na
),
idx1
(
na
),
idx2
(
na
)
integer
(
kind
=
ik
)
::
coltyp
(
na
),
idxq1
(
na
),
idxq2
(
na
)
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
integer
(
kind
=
ik
)
::
gemm_dim_k
,
gemm_dim_l
,
gemm_dim_m
integer
(
kind
=
C_intptr_T
)
::
qtmp1_dev
,
qtmp2_dev
,
ev_dev
logical
::
successCUDA
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
real
integer
(
kind
=
ik
)
::
i
,
j
,
na1
,
na2
,
l_rows
,
l_cols
,
l_rqs
,
l_rqe
,
&
l_rqm
,
ns
,
info
integer
(
kind
=
ik
)
::
l_rnm
,
nnzu
,
nnzl
,
ndef
,
ncnt
,
max_local_cols
,
&
l_cols_qreorg
,
np
,
l_idx
,
nqcols1
,
nqcols2
integer
(
kind
=
ik
)
::
my_proc
,
n_procs
,
my_prow
,
my_pcol
,
np_rows
,
&
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
np_next
,
np_prev
,
np_rem
integer
(
kind
=
ik
)
::
idx
(
na
),
idx1
(
na
),
idx2
(
na
)
integer
(
kind
=
ik
)
::
coltyp
(
na
),
idxq1
(
na
),
idxq2
(
na
)
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
integer
(
kind
=
ik
)
::
gemm_dim_k
,
gemm_dim_l
,
gemm_dim_m
integer
(
kind
=
C_intptr_T
)
::
qtmp1_dev
,
qtmp2_dev
,
ev_dev
logical
::
successCUDA
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
real
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
#ifdef WITH_OPENMP
integer
(
kind
=
ik
)
::
max_threads
,
my_thread
integer
(
kind
=
ik
)
::
omp_get_max_threads
,
omp_get_thread_num
max_threads
=
omp_get_max_threads
()
integer
(
kind
=
ik
)
::
my_thread
allocate
(
z_p
(
na
,
0
:
max_threads
-1
),
stat
=
istat
,
errmsg
=
errorMessage
)
if
(
istat
.ne.
0
)
then
...
...
@@ -438,7 +438,7 @@
#ifdef WITH_OPENMP
call
obj
%
timer
%
start
(
"OpenMP parallel"
//
PRECISION_SUFFIX
)
! OPENMP_CHANGE here
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
my_thread
=
omp_get_thread_num
()
!$OMP DO
...
...
src/elpa1/elpa1_solve_tridi_real_template.F90
View file @
ab19611c
...
...
@@ -57,31 +57,32 @@
subroutine
solve_tridi_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
na
,
nev
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
&
mpi_comm_cols
,
useGPU
,
wantDebug
,
success
)
mpi_comm_cols
,
useGPU
,
wantDebug
,
success
,
max_threads
)
use
precision
use
elpa_abstract_impl
implicit
none
#include "../../src/general/precision_kinds.F90"
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nev
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
d
(
na
),
e
(
na
)
integer
(
kind
=
ik
),
intent
(
in
)
::
na
,
nev
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
d
(
na
),
e
(
na
)
#ifdef USE_ASSUMED_SIZE
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
q
(
ldq
,
*
)
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
q
(
ldq
,
*
)
#else
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
q
(
ldq
,
matrixCols
)
real
(
kind
=
REAL_DATATYPE
),
intent
(
inout
)
::
q
(
ldq
,
matrixCols
)
#endif
logical
,
intent
(
in
)
::
useGPU
,
wantDebug
logical
,
intent
(
out
)
::
success
logical
,
intent
(
in
)
::
useGPU
,
wantDebug
logical
,
intent
(
out
)
::
success
integer
(
kind
=
ik
)
::
i
,
j
,
n
,
np
,
nc
,
nev1
,
l_cols
,
l_rows
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
)
::
i
,
j
,
n
,
np
,
nc
,
nev1
,
l_cols
,
l_rows
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
,
mpierr
integer
(
kind
=
ik
),
allocatable
::
limits
(:),
l_col
(:),
p_col
(:),
l_col_bc
(:),
p_col_bc
(:)
integer
(
kind
=
ik
),
allocatable
::
limits
(:),
l_col
(:),
p_col
(:),
l_col_bc
(:),
p_col_bc
(:)
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
character
(
20
)
::
gpuString
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
character
(
20
)
::
gpuString
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
if
(
useGPU
)
then
gpuString
=
"_gpu"
...
...
@@ -152,7 +153,7 @@ subroutine solve_tridi_&
call
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
l_cols
,
nev1
,
nc
,
d
(
nc
+1
),
e
(
nc
+1
),
q
,
ldq
,
nblk
,
&
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
)
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
,
max_threads
)
if
(
.not.
(
success
))
then
call
obj
%
timer
%
stop
(
"solve_tridi"
//
PRECISION_SUFFIX
)
return
...
...
@@ -335,7 +336,7 @@ subroutine solve_tridi_&
&
PRECISION
&
(
obj
,
nlen
,
nmid
,
d
(
noff
+1
),
e
(
noff
+
nmid
),
q
,
ldq
,
noff
,
&
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
l_col
,
p_col
,
&
l_col_bc
,
p_col_bc
,
np_off
,
nprocs
,
useGPU
,
wantDebug
,
success
)
l_col_bc
,
p_col_bc
,
np_off
,
nprocs
,
useGPU
,
wantDebug
,
success
,
max_threads
)
if
(
.not.
(
success
))
return
else
! Not last merge, leave dense column distribution
...
...
@@ -343,7 +344,7 @@ subroutine solve_tridi_&
&
PRECISION
&
(
obj
,
nlen
,
nmid
,
d
(
noff
+1
),
e
(
noff
+
nmid
),
q
,
ldq
,
noff
,
&
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
l_col
(
noff
+1
),
p_col
(
noff
+1
),
&
l_col
(
noff
+1
),
p_col
(
noff
+1
),
np_off
,
nprocs
,
useGPU
,
wantDebug
,
success
)
l_col
(
noff
+1
),
p_col
(
noff
+1
),
np_off
,
nprocs
,
useGPU
,
wantDebug
,
success
,
max_threads
)
if
(
.not.
(
success
))
return
endif
end
subroutine
merge_recursive_
&
...
...
@@ -354,7 +355,7 @@ subroutine solve_tridi_&
subroutine
solve_tridi_col_
&
&
PRECISION_AND_SUFFIX
&
(
obj
,
na
,
nev
,
nqoff
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
)
(
obj
,
na
,
nev
,
nqoff
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
mpi_comm_rows
,
useGPU
,
wantDebug
,
success
,
max_threads
)
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
...
...
@@ -385,6 +386,8 @@ subroutine solve_tridi_&
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
call
obj
%
timer
%
start
(
"solve_tridi_col"
//
PRECISION_SUFFIX
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_comm_rank
(
mpi_comm_rows
,
my_prow
,
mpierr
)
...
...
@@ -557,7 +560,7 @@ subroutine solve_tridi_&
&
PRECISION
&
(
obj
,
nlen
,
nmid
,
d
(
noff
+1
),
e
(
noff
+
nmid
),
q
,
ldq
,
nqoff
+
noff
,
nblk
,
&
matrixCols
,
mpi_comm_rows
,
mpi_comm_self
,
l_col
(
noff
+1
),
p_col_i
(
noff
+1
),
&
l_col
(
noff
+1
),
p_col_o
(
noff
+1
),
0
,
1
,
useGPU
,
wantDebug
,
success
)
l_col
(
noff
+1
),
p_col_o
(
noff
+1
),
0
,
1
,
useGPU
,
wantDebug
,
success
,
max_threads
)
if
(
.not.
(
success
))
return
enddo
...
...
src/elpa1/elpa1_template.F90
View file @
ab19611c
...
...
@@ -110,6 +110,7 @@ function elpa_solve_evp_&
mpi_comm_all
,
check_pd
,
i
,
error
logical
::
do_bandred
,
do_solve
,
do_trans_ev
integer
(
kind
=
ik
)
::
nrThreads
,
omp_get_num_threads
call
obj
%
timer
%
start
(
"elpa_solve_evp_&
&MATH_DATATYPE&
...
...
@@ -117,6 +118,13 @@ function elpa_solve_evp_&
&PRECISION&
&"
)
#ifdef WITH_OPENMP
nrThreads
=
omp_get_num_threads
()
#else
nrThreads
=
1
#endif
success
=
.true.
if
(
present
(
q
))
then
...
...
@@ -309,7 +317,7 @@ function elpa_solve_evp_&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
(
obj
,
na
,
a
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
ev
,
e
,
tau
,
do_useGPU_tridiag
,
wantDebug
)
&
(
obj
,
na
,
a
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
ev
,
e
,
tau
,
do_useGPU_tridiag
,
wantDebug
,
nrThreads
)
call
obj
%
timer
%
stop
(
"forward"
)
endif
!do_bandred
...
...
@@ -324,7 +332,7 @@ function elpa_solve_evp_&
#if COMPLEXCASE == 1
q_real
,
l_rows
,
&
#endif
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
do_useGPU_solve_tridi
,
wantDebug
,
success
)
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
do_useGPU_solve_tridi
,
wantDebug
,
success
,
nrThreads
)
call
obj
%
timer
%
stop
(
"solve"
)
if
(
.not.
(
success
))
return
endif
!do_solve
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
ab19611c
...
...
@@ -96,12 +96,15 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
na
,
a_mat
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
d_vec
,
e_vec
,
tau
,
useGPU
,
wantDebug
)
(
obj
,
na
,
a_mat
,
lda
,
nblk
,
matrixCols
,
mpi_comm_rows
,
mpi_comm_cols
,
d_vec
,
e_vec
,
tau
,
useGPU
,
wantDebug
,
max_threads
)
use
cuda_functions
use
iso_c_binding
use
precision
use
elpa_abstract_impl
use
matrix_plot
#ifdef WITH_OPENMP
use
omp_lib
#endif
implicit
none
#include "../general/precision_kinds.F90"
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
...
...
@@ -150,11 +153,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer
(
kind
=
ik
)
::
istep
,
i
,
j
,
l_col_beg
,
l_col_end
,
l_row_beg
,
l_row_end
integer
(
kind
=
ik
)
::
tile_size
,
l_rows_per_tile
,
l_cols_per_tile
integer
(
kind
=
c_intptr_t
)
::
a_offset
integer
(
kind
=
c_intptr_t
)
::
a_offset
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
#ifdef WITH_OPENMP
integer
(
kind
=
ik
)
::
my_thread
,
n_threads
,
max_threads
,
n_iter
integer
(
kind
=
ik
)
::
omp_get_thread_num
,
omp_get_num_threads
,
omp_get_max_threads
integer
(
kind
=
ik
)
::
my_thread
,
n_threads
,
n_iter
#endif
real
(
kind
=
REAL_DATATYPE
)
::
vnorm2
...
...
@@ -305,8 +308,6 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&MATH_DATATYPE "
,
"u_col"
,
istat
,
errorMessage
)
#ifdef WITH_OPENMP
max_threads
=
omp_get_max_threads
()
allocate
(
ur_p
(
max_local_rows
,
0
:
max_threads
-1
),
stat
=
istat
,
errmsg
=
errorMessage
)
call
check_alloc
(
"tridiag_&
&MATH_DATATYPE "
,
"ur_p"
,
istat
,
errorMessage
)
...
...
@@ -502,7 +503,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&
_
&
&
PRECISION
&
(
obj
,
v_row
,
ubound
(
v_row
,
dim
=
1
),
mpi_comm_rows
,
v_col
,
ubound
(
v_col
,
dim
=
1
),
mpi_comm_cols
,
&
1
,
istep
-1
,
1
,
nblk
)
1
,
istep
-1
,
1
,
nblk
,
max_threads
)
! Calculate u = (A + VU**T + UV**T)*v
...
...
@@ -532,7 +533,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
my_thread
=
omp_get_thread_num
()
n_threads
=
omp_get_num_threads
()
! debug REMOVE again
print
*
,
"debug"
if
(
n_threads
.ne.
max_threads
)
then
print
*
,
"WTF?"
endif
n_iter
=
0
...
...
@@ -710,7 +717,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&
_
&
&
PRECISION
&
(
obj
,
u_row
,
ubound
(
u_row
,
dim
=
1
),
mpi_comm_rows
,
u_col
,
ubound
(
u_col
,
dim
=
1
),
&
mpi_comm_cols
,
istep
-1
,
1
,
nblk
)
mpi_comm_cols
,
istep
-1
,
1
,
nblk
,
max_threads
)
endif
...
...
@@ -733,7 +740,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&
_
&
&
PRECISION
&
(
obj
,
u_col
,
ubound
(
u_col
,
dim
=
1
),
mpi_comm_cols
,
u_row
,
ubound
(
u_row
,
dim
=
1
),
&
mpi_comm_rows
,
1
,
istep
-1
,
1
,
nblk
)
mpi_comm_rows
,
1
,
istep
-1
,
1
,
nblk
,
max_threads
)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
...
...
src/elpa1/elpa_cholesky_template.F90
View file @
ab19611c
...
...
@@ -48,6 +48,9 @@
use
elpa_mpi
use
precision
use
elpa_abstract_impl
#ifdef WITH_OPENMP
use
omp_lib
#endif
implicit
none
#include "../general/precision_kinds.F90"
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
...
...
@@ -68,6 +71,7 @@
logical
::
success
integer
(
kind
=
ik
)
::
istat
,
debug
,
error
character
(
200
)
::
errorMessage
integer
(
kind
=
ik
)
::
max_threads
call
obj
%
timer
%
start
(
"elpa_cholesky_&
&MATH_DATATYPE&
...
...
@@ -75,6 +79,12 @@
&PRECISION&
&"
)
#ifdef WITH_OPENMP
max_threads
=
omp_get_num_threads
()
#else
max_threads
=
1
#endif
na
=
obj
%
na
lda
=
obj
%
local_nrows
nblk
=
obj
%
nblk
...
...
@@ -285,7 +295,7 @@
&
PRECISION
&
(
obj
,
tmatc
,
ubound
(
tmatc
,
dim
=
1
),
mpi_comm_cols
,
&
tmatr
,
ubound
(
tmatr
,
dim
=
1
),
mpi_comm_rows
,
&
n
,
na
,
nblk
,
nblk
)
n
,
na
,
nblk
,
nblk
,
max_threads
)
do
i
=
0
,(
na
-1
)/
tile_size
lcs
=
max
(
l_colx
,
i
*
l_cols_tile
+1
)
...
...
src/elpa1/elpa_reduce_add_vectors.F90
View file @
ab19611c
...
...
@@ -50,7 +50,7 @@ subroutine elpa_reduce_add_vectors_&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
vmat_s
,
ld_s
,
comm_s
,
vmat_t
,
ld_t
,
comm_t
,
nvr
,
nvc
,
nblk
)
(
obj
,
vmat_s
,
ld_s
,
comm_s
,
vmat_t
,
ld_t
,
comm_t
,
nvr
,
nvc
,
nblk
,
nrThreads
)
!-------------------------------------------------------------------------------
! This routine does a reduce of all vectors in vmat_s over the communicator comm_t.
...
...
@@ -81,7 +81,7 @@ subroutine elpa_reduce_add_vectors_&
use
elpa_abstract_impl
implicit
none
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
integer
(
kind
=
ik
),
intent
(
in
)
::
ld_s
,
comm_s
,
ld_t
,
comm_t
,
nvr
,
nvc
,
nblk
MATH_DATATYPE
(
kind
=
C_DATATYPE_KIND
),
intent
(
in
)
::
vmat_s
(
ld_s
,
nvc
)
MATH_DATATYPE
(
kind
=
C_DATATYPE_KIND
),
intent
(
inout
)
::
vmat_t
(
ld_t
,
nvc
)
...
...
@@ -91,6 +91,8 @@ subroutine elpa_reduce_add_vectors_&
integer
(
kind
=
ik
)
::
n
,
lc
,
k
,
i
,
ips
,
ipt
,
ns
,
nl
,
mpierr
integer
(
kind
=
ik
)
::
lcm_s_t
,
nblks_tot
integer
(
kind
=
ik
)
::
auxstride
integer
(
kind
=
ik
),
intent
(
in
)
::
nrThreads
call
obj
%
timer
%
start
(
"elpa_reduce_add_vectors_&
&MATH_DATATYPE&
...
...
@@ -119,6 +121,8 @@ subroutine elpa_reduce_add_vectors_&
aux1
(:)
=
0
aux2
(:)
=
0
#ifdef WITH_OPENMP
call
omp_set_num_threads
(
nrThreads
)
!$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl)
#endif
do
n
=
0
,
lcm_s_t
-1
...
...
src/elpa1/elpa_solve_tridi_impl_public.F90
View file @
ab19611c
...
...
@@ -63,6 +63,9 @@
&
_
impl
use
precision
use
elpa_abstract_impl
#ifdef WITH_OPENMP
use
omp_lib
#endif
implicit
none
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
...
...
@@ -78,6 +81,7 @@
logical
::
success
integer
::
debug
,
error
integer
::
max_threads
call
obj
%
timer
%
start
(
"elpa_solve_tridi_public_&
&MATH_DATATYPE&
...
...
@@ -90,6 +94,12 @@
ldq
=
obj
%
local_nrows
matrixCols
=
obj
%
local_ncols
#ifdef WITH_OPENMP
max_threads
=
omp_get_num_threads
()
#else
max_threads
=
1
#endif
call
obj
%
get
(
"mpi_comm_rows"
,
mpi_comm_rows
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option. Aborting..."
...
...
@@ -116,7 +126,8 @@
call
solve_tridi_
&
&
PRECISION
&
&
_
private_impl
(
obj
,
na
,
nev
,
d
,
e
,
q
,
ldq
,
nblk
,
matrixCols
,
&
mpi_comm_rows
,
mpi_comm_cols
,
.false.
,
wantDebug
,
success
)
mpi_comm_rows
,
mpi_comm_cols
,
.false.
,
wantDebug
,
success
,
&
max_threads
)
call
obj
%
timer
%
stop
(
"elpa_solve_tridi_public_&
&MATH_DATATYPE&
...
...
src/elpa1/elpa_transpose_vectors.F90
View file @
ab19611c
...
...
@@ -54,7 +54,7 @@ subroutine elpa_transpose_vectors_&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
vmat_s
,
ld_s
,
comm_s
,
vmat_t
,
ld_t
,
comm_t
,
nvs
,
nvr
,
nvc
,
nblk
)
(
obj
,
vmat_s
,
ld_s
,
comm_s
,
vmat_t
,
ld_t
,
comm_t
,
nvs
,
nvr
,
nvc
,
nblk
,
nrThreads
)
!-------------------------------------------------------------------------------
! This routine transposes an array of vectors which are distributed in
...
...
@@ -94,6 +94,7 @@ subroutine elpa_transpose_vectors_&
integer
(
kind
=
ik
)
::
n
,
lc
,
k
,
i
,
ips
,
ipt
,
ns
,
nl
,
mpierr
integer
(
kind
=
ik
)
::
lcm_s_t
,
nblks_tot
,
nblks_comm
,
nblks_skip
integer
(
kind
=
ik
)
::
auxstride
integer
(
kind
=
ik
),
intent
(
in
)
::
nrThreads
call
obj
%
timer
%
start
(
"elpa_transpose_vectors_&
&MATH_DATATYPE&
...
...
src/elpa2/compute_hh_trafo.F90
View file @
ab19611c
...
...
@@ -51,9 +51,9 @@
&
_
&
#endif
&
PRECISION
&
(
obj
,
useGPU
,
wantDebug
,
a
,
a_dev
,
stripe_width
,
a_dim2
,
stripe_count
,
&
(
obj
,
useGPU
,
wantDebug
,
a
,
a_dev
,
stripe_width
,
a_dim2
,
stripe_count
,
max_threads
,
&
#ifdef WITH_OPENMP
max_threads
,
l_nev
,
&
l_nev
,
&
#endif
a_off
,
nbw
,
max_blk_size
,
bcast_buffer
,
bcast_buffer_dev
,
&
#if REALCASE == 1
...
...
@@ -123,6 +123,7 @@
integer
(
kind
=
ik
),
intent
(
in
)
::
stripe_width
,
a_dim2
,
stripe_count
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
#ifndef WITH_OPENMP
integer
(
kind
=
ik
),
intent
(
in
)
::
last_stripe_width
#if REALCASE == 1
...
...
@@ -135,7 +136,7 @@
#endif
#else /* WITH_OPENMP */
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
,
l_nev
,
thread_width
integer
(
kind
=
ik
),
intent
(
in
)
::
l_nev
,
thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:,:)
...
...
src/elpa2/elpa2_bandred_template.F90
View file @
ab19611c
...
...
@@ -65,13 +65,11 @@
&
_
&
&
PRECISION
&
(
obj
,
na
,
a
,
a_dev
,
lda
,
nblk
,
nbw
,
matrixCols
,
numBlocks
,
mpi_comm_rows
,
mpi_comm_cols
,
tmat
,
&
tmat_dev
,
wantDebug
,
useGPU
,
success
&
tmat_dev
,
wantDebug
,
useGPU
,
success
,
&
#if REALCASE == 1
,
useQR
)
#endif
#if COMPLEXCASE == 1
)
useQR
,
&
#endif
max_threads
)
!-------------------------------------------------------------------------------
! bandred_real/complex: Reduces a distributed symmetric matrix to band form
...
...
@@ -187,6 +185,7 @@
&
MATH_DATATYPE
logical
::
useGPU_reduction_lower_block_to_tridiagonal
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
call
obj
%
timer
%
start
(
"bandred_&
...
...
@@ -957,7 +956,7 @@
&
PRECISION
&
(
obj
,
vmrCUDA
,
cur_l_rows
,
mpi_comm_rows
,
&
umcCUDA
(
cur_l_cols
*
n_cols
+
1
),
cur_l_cols
,
&
mpi_comm_cols
,
1
,
istep
*
nbw
,
n_cols
,
nblk
)
mpi_comm_cols
,
1
,
istep
*
nbw
,
n_cols
,
nblk
,
max_threads
)
else
! useGPU
call
elpa_transpose_vectors_
&
&
MATH_DATATYPE
&
...
...
@@ -965,7 +964,7 @@
&
PRECISION
&
(
obj
,
vmrCPU
,
ubound
(
vmrCPU
,
dim
=
1
),
mpi_comm_rows
,
&
umcCPU
(
1
,
n_cols
+1
),
ubound
(
umcCPU
,
dim
=
1
),
mpi_comm_cols
,
&
1
,
istep
*
nbw
,
n_cols
,
nblk
)
1
,
istep
*
nbw
,
n_cols
,
nblk
,
max_threads
)
endif
! Calculate umc = A**T * vmr
...
...
@@ -1011,7 +1010,7 @@
#ifdef WITH_OPENMP
#if REALCASE == 1
n_way
=
omp_get_max_threads
()
n_way
=
max_threads
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
...
...
@@ -1210,7 +1209,7 @@
&
PRECISION
&
(
obj
,
vmrCUDA
(
cur_l_rows
*
n_cols
+
1
),
cur_l_rows
,
&
mpi_comm_rows
,
umcCUDA
,
&
cur_l_cols
,
mpi_comm_cols
,
istep
*
nbw
,
n_cols
,
nblk
)
cur_l_cols
,
mpi_comm_cols
,
istep
*
nbw
,
n_cols
,
nblk
,
max_threads
)
else
! useGPU
call
elpa_reduce_add_vectors_
&
...
...
@@ -1219,7 +1218,7 @@
&
PRECISION
&
(
obj
,
vmrCPU
(
1
,
n_cols
+1
),
ubound
(
vmrCPU
,
dim
=
1
),
mpi_comm_rows
,
&
umcCPU
,
ubound
(
umcCPU
,
dim
=
1
),
mpi_comm_cols
,
&
istep
*
nbw
,
n_cols
,
nblk
)
istep
*
nbw
,
n_cols
,
nblk
,
max_threads
)
endif
! useGPU
endif
! tile_size < istep*nbw .or. n_way > 1
...
...
@@ -1415,7 +1414,7 @@
&
PRECISION
&
(
obj
,
umcCUDA
,
cur_l_cols
,
mpi_comm_cols
,
&
vmrCUDA
(
cur_l_rows
*
n_cols
+
1
),
cur_l_rows
,
mpi_comm_rows
,
&
1
,
istep
*
nbw
,
n_cols
,
nblk
)
1
,
istep
*
nbw
,
n_cols
,
nblk
,
max_threads
)
successCUDA
=
cuda_memcpy
(
vmr_dev
,
&