Commit 5fd793c2 authored by Andreas Marek

Reactivate GPU kernel

parent 65cb91c0
......@@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
#m4_define(elpa_m4_gpu_kernels, [
# real_gpu
# complex_gpu
#])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
#ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
])
#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
# [Compile and always use the GPU version])],
# [],[with_gpu_support_only=no])
#if test x"$with_gpu_support_only" = x"yes" ; then
# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
# use_[]elpa_m4_kernel[]=no
# ])
# use_real_gpu=yes
# use_complex_gpu=yes
#fi
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
[Compile and always use the GPU version])],
[],[with_gpu_support_only=no])
if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
fi
dnl
......@@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
#AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
......
......@@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row 1 element longer to allow store and broadcast tau together with it
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
......@@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
call c_f_pointer(u_row_host,u_row,(/num/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_malloc_host(vu_stored_rows_host,num)
check_host_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA)
call c_f_pointer(vu_stored_rows_host,vu_stored_rows,(/max_local_rows,2*max_stored_uv/))
successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: vu_stored_roes", successCUDA)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
!successCUDA = cuda_malloc_host(uv_stored_cols_host,num)
!check_alloc_cuda("tridiag: uv_stored_cols_host", successCUDA)
!call c_f_pointer(uv_stored_cols_host,uv_stored_cols,(/max_local_cols,2*max_stored_uv/))
successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)
......@@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
endif
......@@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p
check_host_dealloc_cuda("tridiag: u_row_host", successCUDA)
nullify(u_row)
successCUDA = cuda_free_host(vu_stored_rows_host)
check_host_dealloc_cuda("tridiag: uv_stored_rows", successCUDA)
nullify(vu_stored_rows)
successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA)
successCUDA = cuda_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: vu_stored_rows", successCUDA)
successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: e_vec", successCUDA)
successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t))
check_host_unregister_cuda("tridiag: d_vec", successCUDA)
else
deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage
stop 1
endif
endif
deallocate(vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag: error when deallocating "//errorMessage
stop 1
endif
call obj%timer%stop("tridiag_&
&MATH_DATATYPE&
&" // &
......
......@@ -386,7 +386,7 @@
endif
do_useGPU_bandred = do_useGPU
do_useGPU_tridiag_band = do_useGPU
do_useGPU_tridiag_band = .false. ! not yet ported
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev_tridi_to_band = do_useGPU
do_useGPU_trans_ev_band_to_full = do_useGPU
......@@ -403,12 +403,13 @@
endif
do_useGPU_bandred = (gpu == 1)
call obj%get("gpu_tridiag_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_tridiag_band settings. Aborting..."
stop
endif
do_useGPU_tridiag_band = (gpu == 1)
! not yet ported
!call obj%get("gpu_tridiag_band", gpu, error)
!if (error .ne. ELPA_OK) then
! print *,"Problem getting option for gpu_tridiag_band settings. Aborting..."
! stop
!endif
!do_useGPU_tridiag_band = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
......
......@@ -86,6 +86,7 @@
#ifdef WITH_MPI
#include <mpi.h>
#if 0
#ifndef Add_
#define numroc_ numroc
#define dlacpy_ dlacpy
......@@ -102,6 +103,7 @@
#define pctranc_ pctranc
#define pclacpy_ pclacpy
#endif /* Add_ */
#endif
//***********************************************************************************************************
......
......@@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = {
cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
//not yet ported to GPU
//INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
......@@ -704,9 +705,7 @@ static const char *real_kernel_name(int kernel) {
}
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1
// currently the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......@@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) {
}
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1
// currenttly the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......
......@@ -656,6 +656,14 @@ program test
#endif
#ifdef TEST_SOLVER_2STAGE
#if TEST_GPU == 1
#if defined TEST_REAL
kernel = ELPA_2STAGE_REAL_GPU
#endif
#if defined TEST_COMPLEX
kernel = ELPA_2STAGE_COMPLEX_GPU
#endif
#endif
call e%set(KERNEL_KEY, kernel, error_elpa)
#ifdef TEST_KERNEL
assert_elpa_ok(error_elpa)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment