Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
14
Issues
14
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
5fd793c2
Commit
5fd793c2
authored
Apr 03, 2020
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Reactivate GPU kernel
parent
65cb91c0
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
60 additions
and
52 deletions
+60
-52
configure.ac
configure.ac
+17
-17
src/elpa1/elpa1_tridiag_template.F90
src/elpa1/elpa1_tridiag_template.F90
+20
-20
src/elpa2/elpa2_template.F90
src/elpa2/elpa2_template.F90
+8
-7
src/elpa_generalized/cannon.c
src/elpa_generalized/cannon.c
+2
-0
src/elpa_index.c
src/elpa_index.c
+5
-8
test/Fortran/test.F90
test/Fortran/test.F90
+8
-0
No files found.
configure.ac
View file @
5fd793c2
...
@@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [
...
@@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
complex_bgq
])
])
#
m4_define(elpa_m4_gpu_kernels, [
m4_define(elpa_m4_gpu_kernels, [
#
real_gpu
real_gpu
#
complex_gpu
complex_gpu
#
])
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq
gpu
])
m4_define(elpa_m4_all_kernels,
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
m4_foreach_w([elpa_m4_type],
...
@@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
...
@@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
#
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
@@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
...
@@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
fi
])
])
#
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
#
[Compile and always use the GPU version])],
[Compile and always use the GPU version])],
#
[],[with_gpu_support_only=no])
[],[with_gpu_support_only=no])
#
if test x"$with_gpu_support_only" = x"yes" ; then
if test x"$with_gpu_support_only" = x"yes" ; then
#
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
#
use_[]elpa_m4_kernel[]=no
use_[]elpa_m4_kernel[]=no
#
])
])
#
use_real_gpu=yes
use_real_gpu=yes
#
use_complex_gpu=yes
use_complex_gpu=yes
#
fi
fi
dnl
dnl
...
@@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
...
@@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
#
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
5fd793c2
...
@@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
...
@@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
&MATH_DATATYPE "
,
"tmp"
,
istat
,
errorMessage
)
&MATH_DATATYPE "
,
"tmp"
,
istat
,
errorMessage
)
! allocate v_row 1 element longer to allow store and broadcast tau together with it
! allocate v_row 1 element longer to allow store and broadcast tau together with it
allocate
(
uv_stored_cols
(
max_local_cols
,
2
*
max_stored_uv
),
stat
=
istat
,
errmsg
=
errorMessage
)
allocate
(
uv_stored_cols
(
max_local_cols
,
2
*
max_stored_uv
),
stat
=
istat
,
errmsg
=
errorMessage
)
call
check_alloc
(
"tridiag_&
call
check_alloc
(
"tridiag_&
&MATH_DATATYPE "
,
"uv_stored_cols"
,
istat
,
errorMessage
)
&MATH_DATATYPE "
,
"uv_stored_cols"
,
istat
,
errorMessage
)
allocate
(
vu_stored_rows
(
max_local_rows
,
2
*
max_stored_uv
),
stat
=
istat
,
errmsg
=
errorMessage
)
call
check_alloc
(
"tridiag_&
&MATH_DATATYPE "
,
"vu_stored_rows"
,
istat
,
errorMessage
)
if
(
useGPU
)
then
if
(
useGPU
)
then
num
=
(
max_local_rows
+1
)
*
size_of_datatype
num
=
(
max_local_rows
+1
)
*
size_of_datatype
...
@@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
...
@@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
call
c_f_pointer
(
u_row_host
,
u_row
,(/
num
/))
call
c_f_pointer
(
u_row_host
,
u_row
,(/
num
/))
num
=
(
max_local_rows
*
2
*
max_stored_uv
)
*
size_of_datatype
num
=
(
max_local_rows
*
2
*
max_stored_uv
)
*
size_of_datatype
successCUDA
=
cuda_
malloc_host
(
vu_stored_rows_host
,
num
)
successCUDA
=
cuda_
host_register
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
),
num
,&
check_host_alloc_cuda
(
"tridiag: vu_stored_rows_host"
,
successCUDA
)
cudaHostRegisterDefault
)
c
all
c_f_pointer
(
vu_stored_rows_host
,
vu_stored_rows
,(/
max_local_rows
,
2
*
max_stored_uv
/)
)
c
heck_host_register_cuda
(
"tridiag: vu_stored_roes"
,
successCUDA
)
num
=
(
max_local_cols
*
2
*
max_stored_uv
)
*
size_of_datatype
num
=
(
max_local_cols
*
2
*
max_stored_uv
)
*
size_of_datatype
!successCUDA = cuda_malloc_host(uv_stored_cols_host,num)
!check_alloc_cuda("tridiag: uv_stored_cols_host", successCUDA)
!call c_f_pointer(uv_stored_cols_host,uv_stored_cols,(/max_local_cols,2*max_stored_uv/))
successCUDA
=
cuda_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
successCUDA
=
cuda_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
cudaHostRegisterDefault
)
cudaHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: uv_stored_cols"
,
successCUDA
)
check_host_register_cuda
(
"tridiag: uv_stored_cols"
,
successCUDA
)
...
@@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
...
@@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
allocate
(
u_row
(
max_local_rows
),
stat
=
istat
,
errmsg
=
errorMessage
)
allocate
(
u_row
(
max_local_rows
),
stat
=
istat
,
errmsg
=
errorMessage
)
call
check_alloc
(
"tridiag_&
call
check_alloc
(
"tridiag_&
&MATH_DATATYPE "
,
"u_row"
,
istat
,
errorMessage
)
&MATH_DATATYPE "
,
"u_row"
,
istat
,
errorMessage
)
allocate
(
vu_stored_rows
(
max_local_rows
,
2
*
max_stored_uv
),
stat
=
istat
,
errmsg
=
errorMessage
)
call
check_alloc
(
"tridiag_&
&MATH_DATATYPE "
,
"vu_stored_rows"
,
istat
,
errorMessage
)
endif
endif
...
@@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
...
@@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
check_host_dealloc_cuda
(
"tridiag: u_row_host"
,
successCUDA
)
check_host_dealloc_cuda
(
"tridiag: u_row_host"
,
successCUDA
)
nullify
(
u_row
)
nullify
(
u_row
)
successCUDA
=
cuda_free_host
(
vu_stored_rows_host
)
check_host_dealloc_cuda
(
"tridiag: uv_stored_rows"
,
successCUDA
)
nullify
(
vu_stored_rows
)
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
))
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: uv_stored_cols"
,
successCUDA
)
check_host_unregister_cuda
(
"tridiag: uv_stored_cols"
,
successCUDA
)
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: vu_stored_rows"
,
successCUDA
)
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
))
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: e_vec"
,
successCUDA
)
check_host_unregister_cuda
(
"tridiag: e_vec"
,
successCUDA
)
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
))
successCUDA
=
cuda_host_unregister
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: d_vec"
,
successCUDA
)
check_host_unregister_cuda
(
"tridiag: d_vec"
,
successCUDA
)
else
else
deallocate
(
v_row
,
v_col
,
u_row
,
u_col
,
vu_stored_rows
,
uv_stored_cols
,
stat
=
istat
,
errmsg
=
errorMessage
)
deallocate
(
v_row
,
v_col
,
u_row
,
u_col
,
stat
=
istat
,
errmsg
=
errorMessage
)
if
(
istat
.ne.
0
)
then
if
(
istat
.ne.
0
)
then
print
*
,
"tridiag: error when deallocating "
//
errorMessage
print
*
,
"tridiag: error when deallocating "
//
errorMessage
stop
1
stop
1
endif
endif
endif
endif
deallocate
(
vu_stored_rows
,
uv_stored_cols
,
stat
=
istat
,
errmsg
=
errorMessage
)
if
(
istat
.ne.
0
)
then
print
*
,
"tridiag: error when deallocating "
//
errorMessage
stop
1
endif
call
obj
%
timer
%
stop
(
"tridiag_&
call
obj
%
timer
%
stop
(
"tridiag_&
&MATH_DATATYPE&
&MATH_DATATYPE&
&"
//
&
&"
//
&
...
...
src/elpa2/elpa2_template.F90
View file @
5fd793c2
...
@@ -386,7 +386,7 @@
...
@@ -386,7 +386,7 @@
endif
endif
do_useGPU_bandred
=
do_useGPU
do_useGPU_bandred
=
do_useGPU
do_useGPU_tridiag_band
=
do_useGPU
do_useGPU_tridiag_band
=
.false.
! not yet ported
do_useGPU_solve_tridi
=
do_useGPU
do_useGPU_solve_tridi
=
do_useGPU
do_useGPU_trans_ev_tridi_to_band
=
do_useGPU
do_useGPU_trans_ev_tridi_to_band
=
do_useGPU
do_useGPU_trans_ev_band_to_full
=
do_useGPU
do_useGPU_trans_ev_band_to_full
=
do_useGPU
...
@@ -403,12 +403,13 @@
...
@@ -403,12 +403,13 @@
endif
endif
do_useGPU_bandred
=
(
gpu
==
1
)
do_useGPU_bandred
=
(
gpu
==
1
)
call
obj
%
get
(
"gpu_tridiag_band"
,
gpu
,
error
)
! not yet ported
if
(
error
.ne.
ELPA_OK
)
then
!call obj%get("gpu_tridiag_band", gpu, error)
print
*
,
"Problem getting option for gpu_tridiag_band settings. Aborting..."
!if (error .ne. ELPA_OK) then
stop
! print *,"Problem getting option for gpu_tridiag_band settings. Aborting..."
endif
! stop
do_useGPU_tridiag_band
=
(
gpu
==
1
)
!endif
!do_useGPU_tridiag_band = (gpu == 1)
call
obj
%
get
(
"gpu_solve_tridi"
,
gpu
,
error
)
call
obj
%
get
(
"gpu_solve_tridi"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
if
(
error
.ne.
ELPA_OK
)
then
...
...
src/elpa_generalized/cannon.c
View file @
5fd793c2
...
@@ -86,6 +86,7 @@
...
@@ -86,6 +86,7 @@
#ifdef WITH_MPI
#ifdef WITH_MPI
#include <mpi.h>
#include <mpi.h>
#if 0
#ifndef Add_
#ifndef Add_
#define numroc_ numroc
#define numroc_ numroc
#define dlacpy_ dlacpy
#define dlacpy_ dlacpy
...
@@ -102,6 +103,7 @@
...
@@ -102,6 +103,7 @@
#define pctranc_ pctranc
#define pctranc_ pctranc
#define pclacpy_ pclacpy
#define pclacpy_ pclacpy
#endif /* Add_ */
#endif /* Add_ */
#endif
//***********************************************************************************************************
//***********************************************************************************************************
...
...
src/elpa_index.c
View file @
5fd793c2
...
@@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = {
...
@@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = {
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa1
,
NULL
,
PRINT_YES
),
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa1
,
NULL
,
PRINT_YES
),
INT_ENTRY
(
"gpu_bandred"
,
"Use GPU acceleration for ELPA2 band reduction"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
INT_ENTRY
(
"gpu_bandred"
,
"Use GPU acceleration for ELPA2 band reduction"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa2
,
NULL
,
PRINT_YES
),
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa2
,
NULL
,
PRINT_YES
),
INT_ENTRY
(
"gpu_tridiag_band"
,
"Use GPU acceleration for ELPA2 tridiagonalization"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
//not yet ported to GPU
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa2
,
NULL
,
PRINT_YES
),
//INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY
(
"gpu_trans_ev_tridi_to_band"
,
"Use GPU acceleration for ELPA2 trans_ev_tridi_to_band"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
INT_ENTRY
(
"gpu_trans_ev_tridi_to_band"
,
"Use GPU acceleration for ELPA2 trans_ev_tridi_to_band"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa2
,
NULL
,
PRINT_YES
),
cardinality_bool
,
enumerate_identity
,
valid_with_gpu_elpa2
,
NULL
,
PRINT_YES
),
INT_ENTRY
(
"gpu_trans_ev_band_to_full"
,
"Use GPU acceleration for ELPA2 trans_ev_band_to_full"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
INT_ENTRY
(
"gpu_trans_ev_band_to_full"
,
"Use GPU acceleration for ELPA2 trans_ev_band_to_full"
,
1
,
ELPA_AUTOTUNE_MEDIUM
,
ELPA_AUTOTUNE_DOMAIN_ANY
,
\
...
@@ -704,9 +705,7 @@ static const char *real_kernel_name(int kernel) {
...
@@ -704,9 +705,7 @@ static const char *real_kernel_name(int kernel) {
}
}
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1
kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
// currently the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
static
int
real_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
static
int
real_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
...
@@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) {
...
@@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) {
}
}
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
// currenttly the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
static
int
complex_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
static
int
complex_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
...
...
test/Fortran/test.F90
View file @
5fd793c2
...
@@ -656,6 +656,14 @@ program test
...
@@ -656,6 +656,14 @@ program test
#endif
#endif
#ifdef TEST_SOLVER_2STAGE
#ifdef TEST_SOLVER_2STAGE
#if TEST_GPU == 1
#if defined TEST_REAL
kernel
=
ELPA_2STAGE_REAL_GPU
#endif
#if defined TEST_COMPLEX
kernel
=
ELPA_2STAGE_COMPLEX_GPU
#endif
#endif
call
e
%
set
(
KERNEL_KEY
,
kernel
,
error_elpa
)
call
e
%
set
(
KERNEL_KEY
,
kernel
,
error_elpa
)
#ifdef TEST_KERNEL
#ifdef TEST_KERNEL
assert_elpa_ok
(
error_elpa
)
assert_elpa_ok
(
error_elpa
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment