Commit 98d0fd81 authored by Andreas Marek's avatar Andreas Marek

Blocking factor in trans_ev_band_to_full tunable

The blocking factor can now be set with the set method of the new
interface. The default is 3.

Autotuning still has to be implemented.

This closes issue #44
parent 6a8db1f1
......@@ -122,7 +122,7 @@
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
! hvm_dev is first used and set in this routine
! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
......@@ -132,22 +132,27 @@
integer(kind=ik) :: i
#ifdef BAND_TO_FULL_BLOCKING
MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows
#endif
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer :: blocking_factor
call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
#ifdef BAND_TO_FULL_BLOCKING
call obj%get("blocking_in_band_to_full",blocking_factor)
print *,"Blocking factor: ", blocking_factor
#endif
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -432,7 +437,7 @@
#ifdef BAND_TO_FULL_BLOCKING
! t_blocking was formerly 2; 3 is a better choice
t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
t_blocking = blocking_factor ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
! we only use the t_blocking if we could call it fully; this might be better but needs to be benchmarked.
! if ( na >= ((t_blocking+1)*nbw) ) then
......
......@@ -132,7 +132,6 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL),
INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL),
INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid),
INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication"),
INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication"),
INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator"),
......@@ -144,6 +143,8 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, \
number_of_complex_kernels, complex_kernel_enumerate, \
complex_kernel_is_valid, complex_kernel_name),
INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, NULL, NULL, NULL, NULL),
BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0),
BOOL_ENTRY("gpu", "Use GPU acceleration", 0),
BOOL_ENTRY("timings", "Enable time measurement", 0),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment