diff --git a/configure.ac b/configure.ac index 86540ecff833c1deea6a17fa6c357b1bfdf0b155..207a61bd282e4917fdf9cf75506a4bcb06df0e56 100644 --- a/configure.ac +++ b/configure.ac @@ -748,12 +748,12 @@ m4_define(elpa_m4_bgq_kernels, [ complex_bgq ]) -#m4_define(elpa_m4_gpu_kernels, [ -# real_gpu -# complex_gpu -#]) +m4_define(elpa_m4_gpu_kernels, [ + real_gpu + complex_gpu +]) -m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq]) +m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu]) m4_define(elpa_m4_all_kernels, m4_foreach_w([elpa_m4_type], @@ -794,7 +794,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable]) ELPA_SELECT_KERNELS([avx],[enable]) ELPA_SELECT_KERNELS([avx2],[enable]) ELPA_SELECT_KERNELS([avx512],[enable]) -#ELPA_SELECT_KERNELS([gpu],[disable]) +ELPA_SELECT_KERNELS([gpu],[disable]) ELPA_SELECT_KERNELS([bgp],[disable]) ELPA_SELECT_KERNELS([bgq],[disable]) @@ -836,16 +836,16 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ fi ]) -#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only], -# [Compile and always use the GPU version])], -# [],[with_gpu_support_only=no]) -#if test x"$with_gpu_support_only" = x"yes" ; then -# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ -# use_[]elpa_m4_kernel[]=no -# ]) -# use_real_gpu=yes -# use_complex_gpu=yes -#fi +AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only], + [Compile and always use the GPU version])], + [],[with_gpu_support_only=no]) +if test x"$with_gpu_support_only" = x"yes" ; then + m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ + use_[]elpa_m4_kernel[]=no + ]) + use_real_gpu=yes + use_complex_gpu=yes +fi dnl @@ -1307,7 +1307,7 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"]) if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then 
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support]) - #AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build]) + AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be built]) ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 ELPA_2STAGE_REAL_GPU_COMPILED=1 diff --git a/src/elpa1/elpa1_tridiag_template.F90 b/src/elpa1/elpa1_tridiag_template.F90 index db8352cac6358e50735fdcb5b47fd12434dba949..3680e42b6f231ade569e5f14878186f8f18dbcb7 100644 --- a/src/elpa1/elpa1_tridiag_template.F90 +++ b/src/elpa1/elpa1_tridiag_template.F90 @@ -276,9 +276,13 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p &MATH_DATATYPE ", "tmp", istat, errorMessage) ! allocate v_row 1 element longer to allow store and broadcast tau together with it - allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage) - call check_alloc("tridiag_& - &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage) + allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage) + + allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage) if (useGPU) then num = (max_local_rows+1) * size_of_datatype @@ -302,16 +306,11 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p call c_f_pointer(u_row_host,u_row,(/num/)) num = (max_local_rows * 2*max_stored_uv) * size_of_datatype - successCUDA = cuda_malloc_host(vu_stored_rows_host,num) - check_host_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA) - call c_f_pointer(vu_stored_rows_host,vu_stored_rows,(/max_local_rows,2*max_stored_uv/)) + successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,& + cudaHostRegisterDefault) + check_host_register_cuda("tridiag: vu_stored_rows", successCUDA) num = (max_local_cols * 2*max_stored_uv) 
* size_of_datatype - !successCUDA = cuda_malloc_host(uv_stored_cols_host,num) - !check_alloc_cuda("tridiag: uv_stored_cols_host", successCUDA) - - !call c_f_pointer(uv_stored_cols_host,uv_stored_cols,(/max_local_cols,2*max_stored_uv/)) - successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,& cudaHostRegisterDefault) check_host_register_cuda("tridiag: uv_stored_cols", successCUDA) @@ -350,10 +349,6 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage) call check_alloc("tridiag_& &MATH_DATATYPE ", "u_row", istat, errorMessage) - - allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage) - call check_alloc("tridiag_& - &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage) endif @@ -1134,26 +1129,31 @@ call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_p check_host_dealloc_cuda("tridiag: u_row_host", successCUDA) nullify(u_row) - successCUDA = cuda_free_host(vu_stored_rows_host) - check_host_dealloc_cuda("tridiag: uv_stored_rows", successCUDA) - nullify(vu_stored_rows) - successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t)) check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA) + successCUDA = cuda_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t)) + check_host_unregister_cuda("tridiag: vu_stored_rows", successCUDA) + successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t)) check_host_unregister_cuda("tridiag: e_vec", successCUDA) successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t)) check_host_unregister_cuda("tridiag: d_vec", successCUDA) else - deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage) + deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"tridiag: error when deallocating "//errorMessage stop 1 endif endif + deallocate(vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag: error when deallocating "//errorMessage + stop 1 + endif + call obj%timer%stop("tridiag_& &MATH_DATATYPE& &" // & diff --git a/src/elpa2/elpa2_template.F90 b/src/elpa2/elpa2_template.F90 index 7f7011176b18899ce7468a308fb2a562679c5c35..b6d8bfdb146b4d05077821d5f46fa1f52692c947 100644 --- a/src/elpa2/elpa2_template.F90 +++ b/src/elpa2/elpa2_template.F90 @@ -386,7 +386,7 @@ endif do_useGPU_bandred = do_useGPU - do_useGPU_tridiag_band = do_useGPU + do_useGPU_tridiag_band = .false. ! not yet ported do_useGPU_solve_tridi = do_useGPU do_useGPU_trans_ev_tridi_to_band = do_useGPU do_useGPU_trans_ev_band_to_full = do_useGPU @@ -403,12 +403,13 @@ endif do_useGPU_bandred = (gpu == 1) - call obj%get("gpu_tridiag_band", gpu, error) - if (error .ne. ELPA_OK) then - print *,"Problem getting option for gpu_tridiag_band settings. Aborting..." - stop - endif - do_useGPU_tridiag_band = (gpu == 1) + ! not yet ported + !call obj%get("gpu_tridiag_band", gpu, error) + !if (error .ne. ELPA_OK) then + ! print *,"Problem getting option for gpu_tridiag_band settings. Aborting..." + ! stop + !endif + !do_useGPU_tridiag_band = (gpu == 1) call obj%get("gpu_solve_tridi", gpu, error) if (error .ne. 
ELPA_OK) then diff --git a/src/elpa_generalized/cannon.c b/src/elpa_generalized/cannon.c index 38bb5557a3273589d43f9adaab1addc9b2360f52..ae73adb176552cc3af0a93ad49e6dfe8199542ca 100644 --- a/src/elpa_generalized/cannon.c +++ b/src/elpa_generalized/cannon.c @@ -86,6 +86,7 @@ #ifdef WITH_MPI #include +#if 0 #ifndef Add_ #define numroc_ numroc #define dlacpy_ dlacpy @@ -102,6 +103,7 @@ #define pctranc_ pctranc #define pclacpy_ pclacpy #endif /* Add_ */ +#endif //*********************************************************************************************************** diff --git a/src/elpa_index.c b/src/elpa_index.c index 94368996ae8df0e651ee08524d81781c9009478e..1f51fb83b5dda7ce1a64858f673f0aa4a33c83bd 100644 --- a/src/elpa_index.c +++ b/src/elpa_index.c @@ -224,8 +224,9 @@ static const elpa_index_int_entry_t int_entries[] = { cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES), INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), - INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ - cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), + //not yet ported to GPU + //INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + // cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ @@ -704,9 +705,7 @@ static const 
char *real_kernel_name(int kernel) { } #define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ - kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1 -// currently the GPU kernel is never valid -// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1 + kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1 static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) { int solver = elpa_index_get_int_value(index, "solver", NULL); @@ -745,9 +744,7 @@ static const char *complex_kernel_name(int kernel) { } #define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ - kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1 -// currenttly the GPU kernel is never valid -// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1 + kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1 static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) { int solver = elpa_index_get_int_value(index, "solver", NULL); diff --git a/test/Fortran/test.F90 b/test/Fortran/test.F90 index da697d98668dac7f2c1c60447f576bdcd28b79db..c18a5e19547530bdb9dfa1095abd24da3730a3b9 100644 --- a/test/Fortran/test.F90 +++ b/test/Fortran/test.F90 @@ -656,6 +656,14 @@ program test #endif #ifdef TEST_SOLVER_2STAGE +#if TEST_GPU == 1 +#if defined TEST_REAL + kernel = ELPA_2STAGE_REAL_GPU +#endif +#if defined TEST_COMPLEX + kernel = ELPA_2STAGE_COMPLEX_GPU +#endif +#endif call e%set(KERNEL_KEY, kernel, error_elpa) #ifdef TEST_KERNEL assert_elpa_ok(error_elpa)