Commit 5fd793c2 authored by Andreas Marek

Reactivate GPU kernel

parent 65cb91c0
...@@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [ ...@@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq complex_bgq
]) ])
#m4_define(elpa_m4_gpu_kernels, [ m4_define(elpa_m4_gpu_kernels, [
# real_gpu real_gpu
# complex_gpu complex_gpu
#]) ])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq]) m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels, m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type], m4_foreach_w([elpa_m4_type],
...@@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable]) ...@@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable]) ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable]) ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable]) ELPA_SELECT_KERNELS([avx512],[enable])
#ELPA_SELECT_KERNELS([gpu],[disable]) ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable]) ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable]) ELPA_SELECT_KERNELS([bgq],[disable])
...@@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi fi
]) ])
#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only], AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
# [Compile and always use the GPU version])], [Compile and always use the GPU version])],
# [],[with_gpu_support_only=no]) [],[with_gpu_support_only=no])
#if test x"$with_gpu_support_only" = x"yes" ; then if test x"$with_gpu_support_only" = x"yes" ; then
# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
# use_[]elpa_m4_kernel[]=no use_[]elpa_m4_kernel[]=no
# ]) ])
# use_real_gpu=yes use_real_gpu=yes
# use_complex_gpu=yes use_complex_gpu=yes
#fi fi
dnl dnl
...@@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ ...@@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"]) AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support]) AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
#AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build]) AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1 ELPA_2STAGE_REAL_GPU_COMPILED=1
......
...@@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p ...@@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
&MATH_DATATYPE ", "tmp", istat, errorMessage) &MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row 1 element longer to allow store and broadcast tau together with it ! allocate v_row 1 element longer to allow store and broadcast tau together with it
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage) allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_& call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage) &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
if (useGPU) then if (useGPU) then
num = (max_local_rows+1) * size_of_datatype num = (max_local_rows+1) * size_of_datatype
...@@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p ...@@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
call c_f_pointer(u_row_host,u_row,(/num/)) call c_f_pointer(u_row_host,u_row,(/num/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_malloc_host(vu_stored_rows_host,num) successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
check_host_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA) cudaHostRegisterDefault)
call c_f_pointer(vu_stored_rows_host,vu_stored_rows,(/max_local_rows,2*max_stored_uv/)) check_host_register_cuda("tridiag: vu_stored_roes", successCUDA)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
!successCUDA = cuda_malloc_host(uv_stored_cols_host,num)
!check_alloc_cuda("tridiag: uv_stored_cols_host", successCUDA)
!call c_f_pointer(uv_stored_cols_host,uv_stored_cols,(/max_local_cols,2*max_stored_uv/))
successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,& successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
cudaHostRegisterDefault) cudaHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successCUDA) check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)
...@@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p ...@@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage) allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_& call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage) &MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
endif endif
...@@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p ...@@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
check_host_dealloc_cuda("tridiag: u_row_host", successCUDA) check_host_dealloc_cuda("tridiag: u_row_host", successCUDA)
nullify(u_row) nullify(u_row)
successCUDA = cuda_free_host(vu_stored_rows_host)
check_host_dealloc_cuda("tridiag: uv_stored_rows", successCUDA)
nullify(vu_stored_rows)
successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t)) successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA) check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA)
successCUDA = cuda_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: vu_stored_rows", successCUDA)
successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t)) successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: e_vec", successCUDA) check_host_unregister_cuda("tridiag: e_vec", successCUDA)
successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t)) successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: d_vec", successCUDA) check_host_unregister_cuda("tridiag: d_vec", successCUDA)
else else
deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage) deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage print *,"tridiag: error when deallocating "//errorMessage
stop 1 stop 1
endif endif
endif endif
deallocate(vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage
stop 1
endif
call obj%timer%stop("tridiag_& call obj%timer%stop("tridiag_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
......
...@@ -386,7 +386,7 @@ ...@@ -386,7 +386,7 @@
endif endif
do_useGPU_bandred = do_useGPU do_useGPU_bandred = do_useGPU
do_useGPU_tridiag_band = do_useGPU do_useGPU_tridiag_band = .false. ! not yet ported
do_useGPU_solve_tridi = do_useGPU do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev_tridi_to_band = do_useGPU do_useGPU_trans_ev_tridi_to_band = do_useGPU
do_useGPU_trans_ev_band_to_full = do_useGPU do_useGPU_trans_ev_band_to_full = do_useGPU
...@@ -403,12 +403,13 @@ ...@@ -403,12 +403,13 @@
endif endif
do_useGPU_bandred = (gpu == 1) do_useGPU_bandred = (gpu == 1)
call obj%get("gpu_tridiag_band", gpu, error) ! not yet ported
if (error .ne. ELPA_OK) then !call obj%get("gpu_tridiag_band", gpu, error)
print *,"Problem getting option for gpu_tridiag_band settings. Aborting..." !if (error .ne. ELPA_OK) then
stop ! print *,"Problem getting option for gpu_tridiag_band settings. Aborting..."
endif ! stop
do_useGPU_tridiag_band = (gpu == 1) !endif
!do_useGPU_tridiag_band = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error) call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then if (error .ne. ELPA_OK) then
......
...@@ -86,6 +86,7 @@ ...@@ -86,6 +86,7 @@
#ifdef WITH_MPI #ifdef WITH_MPI
#include <mpi.h> #include <mpi.h>
#if 0
#ifndef Add_ #ifndef Add_
#define numroc_ numroc #define numroc_ numroc
#define dlacpy_ dlacpy #define dlacpy_ dlacpy
...@@ -102,6 +103,7 @@ ...@@ -102,6 +103,7 @@
#define pctranc_ pctranc #define pctranc_ pctranc
#define pclacpy_ pclacpy #define pclacpy_ pclacpy
#endif /* Add_ */ #endif /* Add_ */
#endif
//*********************************************************************************************************** //***********************************************************************************************************
......
...@@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = { ...@@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = {
cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES), cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ //not yet ported to GPU
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), //INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
...@@ -704,9 +705,7 @@ static const char *real_kernel_name(int kernel) { ...@@ -704,9 +705,7 @@ static const char *real_kernel_name(int kernel) {
} }
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ #define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1 kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
// currently the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) { static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL); int solver = elpa_index_get_int_value(index, "solver", NULL);
...@@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) { ...@@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) {
} }
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ #define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1 kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
// currenttly the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) { static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL); int solver = elpa_index_get_int_value(index, "solver", NULL);
......
...@@ -656,6 +656,14 @@ program test ...@@ -656,6 +656,14 @@ program test
#endif #endif
#ifdef TEST_SOLVER_2STAGE #ifdef TEST_SOLVER_2STAGE
#if TEST_GPU == 1
#if defined TEST_REAL
kernel = ELPA_2STAGE_REAL_GPU
#endif
#if defined TEST_COMPLEX
kernel = ELPA_2STAGE_COMPLEX_GPU
#endif
#endif
call e%set(KERNEL_KEY, kernel, error_elpa) call e%set(KERNEL_KEY, kernel, error_elpa)
#ifdef TEST_KERNEL #ifdef TEST_KERNEL
assert_elpa_ok(error_elpa) assert_elpa_ok(error_elpa)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment