elpa / elpa · Commits · cc266ee2

Commit cc266ee2, authored Feb 24, 2021 by Andreas Marek

Merge branch 'master_pre_stage' into 'master'

Master pre stage

See merge request !58

Parents: 1d2b3806, 20206932
Changes: 23 files · Pipelines: 2
CONTRIBUTING.md

@@ -4,7 +4,7 @@ We are very happy and grateful if you are willing to help us improve *ELPA*.
 Thus, we would like to make this process as simple as possible for you,
 but at the same time still keep it manageable for us
-For recommendations and suggestions, a simple email to us (*elpa-libray at mpcdf.mpg.de*) is sufficient!
+For recommendations and suggestions, a simple email to us (*elpa-library at mpcdf.mpg.de*) is sufficient!
 If you would like to share with us your improvements, we suggest the following ways:
Changelog

@@ -2,6 +2,9 @@ Changelog for next release
 - not yet decided

+Upcoming changes for ELPA 2021.05.001
+- allow the user to set the mapping of MPI tasks to GPU id per set/get
+
 Changelog for ELPA 2020.11.001
 - this release contains mostly bugfixes:
Makefile.am

@@ -138,7 +138,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
 endif
 if WITH_GPU_VERSION
-  libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
+  libelpa@SUFFIX@_private_la_SOURCES += src/GPU/elpa_index_gpu.cu src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
 endif
 if !WITH_MPI
generate_automake_test_programs.py

@@ -24,6 +24,10 @@ gpu_flag = {
     0: "-DTEST_GPU=0",
     1: "-DTEST_GPU=1",
 }
+gpu_id_flag = {
+    0: "-DTEST_GPU_SET_ID=0",
+    1: "-DTEST_GPU_SET_ID=1",
+}
 matrix_flag = {
     "random": "-DTEST_MATRIX_RANDOM",

@@ -57,9 +61,10 @@ split_comm_flag = {
     "by_elpa": ""
 }
-for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
+for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                                                    sorted(matrix_flag.keys()),
                                                    sorted(gpu_flag.keys()),
+                                                   sorted(gpu_id_flag.keys()),
                                                    sorted(qr_flag.keys()),
                                                    sorted(test_type_flag.keys()),
                                                    sorted(prec_flag.keys()),

@@ -68,6 +73,9 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                                                    sorted(layout_flag.keys()),
                                                    sorted(split_comm_flag.keys())):
+    if gid == 1 and (g == 0):
+        continue
+
     if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
         continue

@@ -178,11 +186,12 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                 raise Exception("Oh no!")
             endifs += 1
-    name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
+    name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{gpuidsuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
         langsuffix=language_flag[lang],
         d=d,
         p=p,
         t=t,
         s=s,
         kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
         gpusuffix="gpu_" if g else "",
+        gpuidsuffix="set_gpu_id_" if gid else "",
         qrsuffix="qr_" if q else "",
         m=m,
         layoutsuffix="_all_layouts" if lay == "all_layouts" else "",

@@ -227,6 +236,7 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
         test_type_flag[t],
         solver_flag[s],
         gpu_flag[g],
+        gpu_id_flag[gid],
         qr_flag[q],
         matrix_flag[m]] + extra_flags))
src/GPU/check_for_gpu.F90

@@ -62,7 +62,7 @@ module mod_check_for_gpu
     integer(kind=ik), intent(out) :: numberOfDevices
     integer(kind=ik)              :: deviceNumber, mpierr, maxNumberOfDevices
     logical                       :: gpuAvailable
-    integer(kind=ik)              :: error, mpi_comm_all
+    integer(kind=ik)              :: error, mpi_comm_all, use_gpu_id, min_use_gpu_id
     !character(len=1024) :: envname

     if (.not.(present(wantDebug))) then

@@ -83,42 +83,24 @@ module mod_check_for_gpu
       stop
     endif

     if (cublasHandle .ne. -1) then
       gpuAvailable = .true.
       numberOfDevices = -1
       if (myid == 0 .and. wantDebugMessage) then
         print *, "Skipping GPU init, should have already been initialized "
       endif
       return
     else
       if (myid == 0 .and. wantDebugMessage) then
         print *, "Initializing the GPU devices"
     if (obj%is_set("use_gpu_id") == 1) then
       call obj%get("use_gpu_id", use_gpu_id, error)
       if (use_gpu_id == -99) then
         print *, "Problem you did not set which gpu id this task should use"
       endif
     endif
     ! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
     success = cuda_getdevicecount(numberOfDevices)
     if (.not.(success)) then
       print *, "error in cuda_getdevicecount"
       stop 1
     endif
     ! make sure that all nodes have the same number of GPU's, otherwise
     ! we run into loadbalancing trouble
     ! check whether gpu id has been set for each process
 #ifdef WITH_MPI
     call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
     call mpi_allreduce(use_gpu_id, min_use_gpu_id, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
     if (maxNumberOfDevices .ne. numberOfDevices) then
       print *, "Different number of GPU devices on MPI tasks!"
       print *, "GPUs will NOT be used!"
       gpuAvailable = .false.
       return
     endif
     if (min_use_gpu_id .lt. 0) then
       print *, "Not all tasks have set which GPU id should be used"
       print *, "GPUs will NOT be used!"
       gpuAvailable = .false.
       return
     endif
 #endif
     if (numberOfDevices .ne. 0) then
       gpuAvailable = .true.
       ! Usage of GPU is possible since devices have been detected
       if (myid == 0) then
         if (wantDebugMessage) then

@@ -127,8 +109,7 @@ module mod_check_for_gpu
         endif
       endif
       deviceNumber = mod(myid, numberOfDevices)
-      success = cuda_setdevice(deviceNumber)
+      success = cuda_setdevice(use_gpu_id)
       if (.not.(success)) then
         print *, "Cannot set CudaDevice"

@@ -137,14 +118,77 @@ module mod_check_for_gpu
       if (wantDebugMessage) then
         print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
       endif
       success = cublas_create(cublasHandle)
       if (.not.(success)) then
         print *, "Cannot create cublas handle"
         stop 1
       endif
     endif
     else
       if (cublasHandle .ne. -1) then
         gpuAvailable = .true.
         numberOfDevices = -1
         if (myid == 0 .and. wantDebugMessage) then
           print *, "Skipping GPU init, should have already been initialized "
         endif
         return
       else
         if (myid == 0 .and. wantDebugMessage) then
           print *, "Initializing the GPU devices"
         endif
       endif
       ! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
       success = cuda_getdevicecount(numberOfDevices)
       if (.not.(success)) then
         print *, "error in cuda_getdevicecount"
         stop 1
       endif
       ! make sure that all nodes have the same number of GPU's, otherwise
       ! we run into loadbalancing trouble
 #ifdef WITH_MPI
       call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
       if (maxNumberOfDevices .ne. numberOfDevices) then
         print *, "Different number of GPU devices on MPI tasks!"
         print *, "GPUs will NOT be used!"
         gpuAvailable = .false.
         return
       endif
 #endif
       if (numberOfDevices .ne. 0) then
         gpuAvailable = .true.
         ! Usage of GPU is possible since devices have been detected
         if (myid == 0) then
           if (wantDebugMessage) then
             print *
             print '(3(a,i0))', 'Found ', numberOfDevices, ' GPUs'
           endif
         endif
         deviceNumber = mod(myid, numberOfDevices)
         success = cuda_setdevice(deviceNumber)
         if (.not.(success)) then
           print *, "Cannot set CudaDevice"
           stop 1
         endif
         if (wantDebugMessage) then
           print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
         endif
         success = cublas_create(cublasHandle)
         if (.not.(success)) then
           print *, "Cannot create cublas handle"
           stop 1
         endif
       endif
     endif
   end function
 end module
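The change above adds a collective sanity check: every MPI task contributes its device count and its chosen GPU id, and the GPU path is disabled on all tasks if the counts differ or an id was left unset. A hedged, standalone C sketch of that idea (not ELPA code, function name and the use of MPI_MIN for the id check are this sketch's own; only the MPI and CUDA runtime calls are real APIs):

    #include <stdio.h>
    #include <mpi.h>
    #include <cuda_runtime.h>

    /* Returns 1 if the GPU path may be used on all ranks, 0 otherwise.
       "my_gpu_id" plays the role of the "use_gpu_id" setting
       (a negative value meaning "not set"). */
    int gpus_usable(MPI_Comm comm, int my_gpu_id)
    {
        int ndev = 0, max_ndev = 0, min_id = 0;

        if (cudaGetDeviceCount(&ndev) != cudaSuccess) ndev = 0;

        /* all ranks must see the same number of devices ... */
        MPI_Allreduce(&ndev, &max_ndev, 1, MPI_INT, MPI_MAX, comm);
        /* ... and every rank must have chosen a non-negative GPU id */
        MPI_Allreduce(&my_gpu_id, &min_id, 1, MPI_INT, MPI_MIN, comm);

        if (max_ndev != ndev || min_id < 0) {
            printf("GPUs will NOT be used!\n");
            return 0;
        }
        return cudaSetDevice(my_gpu_id) == cudaSuccess;
    }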
src/GPU/cuUtils.cu

@@ -57,7 +57,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_REAL

-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL

 #undef DOUBLE_PRECISION_REAL
 #include "cuUtils_template.cu"

@@ -71,7 +71,7 @@
 #include "cuUtils_template.cu"
 #undef DOUBLE_PRECISION_COMPLEX

-#if WANT_SINGLE_PRECISION_COMPLEX
+#ifdef WANT_SINGLE_PRECISION_COMPLEX

 #undef DOUBLE_PRECISION_COMPLEX
 #include "cuUtils_template.cu"
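The #if to #ifdef switch above changes how the optional feature macro is tested: #if evaluates an undefined macro as 0 and chokes on a macro defined without a value, while #ifdef only asks whether the macro is defined at all, which matches flags passed as plain -DWANT_SINGLE_PRECISION_REAL. A small standalone C sketch of the difference (not ELPA code):

    #include <stdio.h>

    /* try building with and without -DWANT_SINGLE_PRECISION_REAL;
       with "#if" a value-less definition would be a preprocessor error,
       with "#ifdef" mere definition is enough */
    int main(void) {
    #ifdef WANT_SINGLE_PRECISION_REAL
        printf("single-precision real kernels enabled\n");
    #else
        printf("single-precision real kernels disabled\n");
    #endif
        return 0;
    }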
src/GPU/elpa_index_gpu.cu
0 → 100644 (new file)

 extern "C" {
   int gpu_count() {
     int count;
     cudaError_t cuerr = cudaGetDeviceCount(&count);
     if (cuerr != cudaSuccess) {
       count = -1000;
     }
     return count;
   }
 }
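A hypothetical caller sketch, showing how a plain C translation unit could use the new gpu_count() wrapper; the prototype is taken from the extern "C" definition above and the error convention (a negative value when cudaGetDeviceCount fails) from its body, while the surrounding main() is purely illustrative:

    #include <stdio.h>

    int gpu_count(void);   /* provided by src/GPU/elpa_index_gpu.cu */

    int main(void) {
        int n = gpu_count();
        if (n <= 0)
            printf("no usable GPUs (gpu_count returned %d)\n", n);
        else
            printf("found %d GPU(s)\n", n);
        return 0;
    }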
src/elpa.F90

@@ -143,6 +143,16 @@
 !>   print *,"Could not setup ELPA object"
 !> endif
 !>
+!> ! settings for GPU
+!> call elpaInstance%set("gpu", 1, success)   ! 1=on, 2=off
+!> ! in case of GPU usage you have the choice whether ELPA
+!> ! should automatically assign each MPI task to a certain GPU
+!> ! (this is the default) or whether you want to set this assignment
+!> ! for _each_ task yourself
+!> ! set the assignment yourself (only using one task here and assigning it
+!> ! to GPU id 1)
+!> if (my_rank .eq. 0) call elpaInstance%set("use_gpu_id", 1, success)
+!>
 !> ! if desired, set tunable run-time options
 !> ! here we want to use the 2-stage solver
 !> call elpaInstance%set("solver", ELPA_SOLVER_2STAGE, success)

@@ -206,6 +216,16 @@
 !>   /* here we want to use the 2-stage solver */
 !>   elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error);
 !>
+!>   /* settings for GPU */
+!>   elpa_set(handle, "gpu", 1, &error);   /* 1=on, 2=off */
+!>   /* in case of GPU usage you have the choice whether ELPA
+!>      should automatically assign each MPI task to a certain GPU
+!>      (this is the default) or whether you want to set this assignment
+!>      for _each_ task yourself;
+!>      set the assignment yourself (only using one task here and assigning it
+!>      to GPU id 1) */
+!>   if (my_rank == 0) elpa_set(handle, "use_gpu_id", 1, &error);
+!>
 !>   elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error);
 !> \endcode
 !> ... set and get all other options that are desired
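Building on the documentation added above, a hedged C sketch of the per-task GPU assignment: each rank enables the GPU and then binds itself to a device id derived from its rank. Only the two elpa_set() calls mirror the documented API; the function name, the "local_gpus" parameter, and the rank-modulo mapping are this sketch's own assumptions, and "handle" is assumed to be an already allocated elpa_t object.

    #include <mpi.h>
    #include <elpa/elpa.h>

    void assign_gpus(elpa_t handle, int local_gpus)
    {
        int my_rank, error;
        MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

        /* enable GPU support */
        elpa_set(handle, "gpu", 1, &error);

        /* one explicit choice per task instead of ELPA's automatic mapping */
        elpa_set(handle, "use_gpu_id", my_rank % local_gpus, &error);
    }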
src/elpa1/elpa1_auxiliary.F90

@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"

@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
 #undef DOUBLE_PRECISION
 #undef REALCASE
-#if WANT_SINGLE_PRECISION_REAL
+#ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
 #include "../general/precision_macros.h"
src/elpa1/elpa1_template.F90

@@ -279,8 +279,6 @@ function elpa_solve_evp_&
   ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-  ! store the number of OpenMP threads used in the calling function
-  ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
 #endif
   call obj%timer%stop("elpa_solve_evp_&

@@ -356,6 +354,18 @@ function elpa_solve_evp_&
       success = .false.
       return
     endif
+#ifdef WITH_OPENMP_TRADITIONAL
+    ! check the number of threads that ELPA should use internally
+    ! in the GPU case at the moment only _1_ thread internally is allowed
+    call obj%get("omp_threads", nrThreads, error)
+    if (nrThreads .ne. 1) then
+      print *, "Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+      print *, "setting 1 openmp thread now"
+      call obj%set("omp_threads", 1, error)
+      nrThreads = 1
+      call omp_set_num_threads(nrThreads)
+    endif
+#endif
     call obj%timer%stop("check_for_gpu")
   endif

@@ -589,8 +599,6 @@ function elpa_solve_evp_&
 #endif
   ! restore original OpenMP settings
 #ifdef WITH_OPENMP_TRADITIONAL
-  ! store the number of OpenMP threads used in the calling function
-  ! restore this at the end of ELPA 2
   call omp_set_num_threads(omp_threads_caller)
 #endif
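The pattern added above (force a single OpenMP thread while the GPU code path runs, then restore the caller's thread count afterwards) can be illustrated outside ELPA with a short C/OpenMP sketch; only omp_get_max_threads and omp_set_num_threads are real OpenMP calls, everything else is made up for the example:

    #include <stdio.h>
    #include <omp.h>

    /* illustrative stand-in for a region that is only safe with one thread */
    static void gpu_section(void)
    {
        printf("GPU path running with %d thread(s)\n", omp_get_max_threads());
    }

    int main(void)
    {
        int omp_threads_caller = omp_get_max_threads();  /* remember caller's setting */

        omp_set_num_threads(1);                          /* GPU path currently allows one thread */
        gpu_section();

        omp_set_num_threads(omp_threads_caller);         /* restore original OpenMP settings */
        printf("restored to %d thread(s)\n", omp_get_max_threads());
        return 0;
    }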
src/elpa1/elpa1_tridiag_template.F90

@@ -612,26 +612,28 @@ subroutine tridiag_&
           if (l_row_end < l_row_beg) cycle
 #ifdef WITH_OPENMP_TRADITIONAL
           if (mod(n_iter,n_threads) == my_thread) then
-            if (wantDebug) call obj%timer%start("blas")
-            call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
-                                int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
-            if (i/=j) then
-              if (isSkewsymmetric) then
-                call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                    int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                    -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                    v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                    ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
-              else
-                call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
-                                    int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-                                    ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
-                                    v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
-                                    ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+            if (.not. useGPU) then
+              if (wantDebug) call obj%timer%start("blas")
+              call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
+                                  int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                  ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                  v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
+              if (i/=j) then
+                if (isSkewsymmetric) then
+                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                      -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                else
+                  call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
+                                      int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
+                                      ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
+                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
+                                      ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
+                endif
+              endif
+            endif
             if (wantDebug) call obj%timer%stop("blas")

@@ -750,11 +752,12 @@ subroutine tridiag_&
 #ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
           call obj%timer%stop("OpenMP parallel")
-          do i = 0, max_threads-1
-            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
-            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
-          enddo
+          if (.not.(useGPU)) then
+            do i = 0, max_threads-1
+              u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
+              u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
+            enddo
+          endif
 #endif /* WITH_OPENMP_TRADITIONAL */

           ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
src/elpa1/elpa_reduce_add_vectors.F90

@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
   aux2(:) = 0
 #ifdef WITH_OPENMP_TRADITIONAL
-  !call omp_set_num_threads(nrThreads)
-  !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
+  !$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
+  !$omp&       comm_t, nblks_tot, aux2, aux1, nvr, nvc)
 #endif
   do n = 0, lcm_s_t-1
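This and the two following changes tighten the OpenMP parallel regions with default(none) plus explicit shared/private lists. A hedged, standalone C sketch of the same idiom (not ELPA code):

    #include <stdio.h>

    #define N 8

    int main(void)
    {
        int squares[N] = {0};

        /* default(none): every variable referenced inside the region must be
           listed explicitly, so a forgotten shared/private declaration becomes
           a compile-time error instead of a silent data race. */
        #pragma omp parallel for default(none) shared(squares)
        for (int i = 0; i < N; i++)
            squares[i] = i * i;    /* the loop index is private automatically */

        int total = 0;
        for (int i = 0; i < N; i++)
            total += squares[i];
        printf("sum of squares 0..%d = %d\n", N - 1, total);
        return 0;
    }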
src/elpa1/elpa_transpose_vectors.F90

@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
   check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp&       obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
src/elpa1/elpa_transpose_vectors_ss.F90

@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
 #ifdef WITH_OPENMP_TRADITIONAL
-  !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
+  !$omp parallel &
+  !$omp default(none) &
+  !$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
+  !$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
+  !$omp&       obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
 #endif
   do n = 0, lcm_s_t-1
src/elpa2/compute_hh_trafo.F90

@@ -58,11 +58,10 @@ l_nev, &
                              a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
                              hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
 #ifdef WITH_OPENMP_TRADITIONAL
-                             my_thread, thread_width, &
+                             my_thread, thread_width, kernel, last_stripe_width)
 #else
-                             last_stripe_width, &
+                             last_stripe_width, kernel)
 #endif
-                             kernel)

   use precision
   use elpa_abstract_impl

@@ -141,6 +140,7 @@ kernel)
 #else /* WITH_OPENMP_TRADITIONAL */
   integer(kind=ik), intent(in)           :: l_nev, thread_width
+  integer(kind=ik), intent(in), optional :: last_stripe_width
 #if REALCASE == 1
   ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
   real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)

@@ -221,54 +221,39 @@ kernel)
 #ifdef WITH_OPENMP_TRADITIONAL
   if (my_thread==1) then
     ! in the calling routine threads go from 1 .. max_threads
 #endif
   ttt = mpi_wtime()
 #ifdef WITH_OPENMP_TRADITIONAL
   endif
 #endif

 #ifdef WITH_OPENMP_TRADITIONAL
 #if REALCASE == 1
   if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
     print *,"compute_hh_trafo_&
             &MATH_DATATYPE&
             &_GPU OPENMP: not yet implemented"
     stop 1
   endif
 #endif
 #if COMPLEXCASE == 1
   if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
     print *,"compute_hh_trafo_&
             &MATH_DATATYPE&
             &_GPU OPENMP: not yet implemented"
     stop 1
   endif
 #endif
 #endif /* WITH_OPENMP_TRADITIONAL */

 #ifndef WITH_OPENMP_TRADITIONAL
   nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
 #else /* WITH_OPENMP_TRADITIONAL */
-  if (istripe<stripe_count) then
-    nl = stripe_width
+  if (present(last_stripe_width)) then
+    nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
   else
     if (istripe<stripe_count) then
       nl = stripe_width
     else
       noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
       nl = min(my_thread*thread_width-noff, l_nev-noff)
       if (nl<=0) then
         if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
         &MATH_DATATYPE&
 #ifdef WITH_OPENMP_TRADITIONAL
         &_openmp" // &
 #else
         &" // &
 #endif
         &PRECISION_SUFFIX&
         )
         return
       endif
     endif
   endif
 #endif /* not WITH_OPENMP_TRADITIONAL */
src/elpa2/elpa2_bandred_template.F90

@@ -629,65 +629,65 @@ max_threads)
           aux1 = 0.0_rck
 #ifdef WITH_OPENMP_TRADITIONAL
 #if 0
           ! original complex implementation without openmp. check performance
           nlc = 0 ! number of local columns
           do j = 1, lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx > 0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr), a_mat(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
 #ifdef WITH_MPI
           if (wantDebug) call obj%timer%start("mpi_communication")
           if (nlc > 0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
                                           int(mpi_comm_rows,kind=MPI_KIND), mpierr)

           ! Transform
           nlc = 0
           do j = 1, lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx > 0) then
               nlc = nlc+1
               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
             endif
           enddo

           if (wantDebug) call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */

           ! Transform
...