Commit 98d0fd81 authored by Andreas Marek's avatar Andreas Marek
Browse files

Blocking factor in trans_ev_band_to_full tunable

The blocking factor can now be set via the set method of the new
interface. The default is 3.

Autotuning still has to be implemented.

This closes issue #44
parent 6a8db1f1
...@@ -122,7 +122,7 @@ ...@@ -122,7 +122,7 @@
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
! hvm_dev is first used and set in this routine ! hvm_dev is first used and set in this routine
! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted ! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine ! tmp_dev is first used in this routine
...@@ -132,22 +132,27 @@ ...@@ -132,22 +132,27 @@
integer(kind=ik) :: i integer(kind=ik) :: i
#ifdef BAND_TO_FULL_BLOCKING #ifdef BAND_TO_FULL_BLOCKING
MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:) MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows
#endif #endif
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
logical :: successCUDA logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION& &PRECISION&
&_& &_&
&MATH_DATATYPE &MATH_DATATYPE
integer :: blocking_factor
call obj%timer%start("trans_ev_band_to_full_& call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
#ifdef BAND_TO_FULL_BLOCKING
call obj%get("blocking_in_band_to_full",blocking_factor)
print *,"Blocking factor: ", blocking_factor
#endif
call obj%timer%start("mpi_communication") call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -432,7 +437,7 @@ ...@@ -432,7 +437,7 @@
#ifdef BAND_TO_FULL_BLOCKING #ifdef BAND_TO_FULL_BLOCKING
! t_blocking was formerly 2; 3 is a better choice ! t_blocking was formerly 2; 3 is a better choice
t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once t_blocking = blocking_factor ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
! we only use the t_blocking if we could call it fully; this might be better but needs to be benchmarked. ! we only use the t_blocking if we could call it fully; this might be better but needs to be benchmarked.
! if ( na >= ((t_blocking+1)*nbw) ) then ! if ( na >= ((t_blocking+1)*nbw) ) then
......
...@@ -132,7 +132,6 @@ static const elpa_index_int_entry_t int_entries[] = { ...@@ -132,7 +132,6 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL), INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL),
INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL), INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL),
INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid), INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid),
INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication"), INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication"),
INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication"), INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication"),
INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator"), INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator"),
...@@ -144,6 +143,8 @@ static const elpa_index_int_entry_t int_entries[] = { ...@@ -144,6 +143,8 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, \ INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, \
number_of_complex_kernels, complex_kernel_enumerate, \ number_of_complex_kernels, complex_kernel_enumerate, \
complex_kernel_is_valid, complex_kernel_name), complex_kernel_is_valid, complex_kernel_name),
INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, NULL, NULL, NULL, NULL),
BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0), BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0),
BOOL_ENTRY("gpu", "Use GPU acceleration", 0), BOOL_ENTRY("gpu", "Use GPU acceleration", 0),
BOOL_ENTRY("timings", "Enable time measurement", 0), BOOL_ENTRY("timings", "Enable time measurement", 0),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment