Commit 0183d8ca authored by Pavel Kus's avatar Pavel Kus

tile size can be influenced by the user

tile size is used in elpa1_tridiag and elpa2_bandred.
Its value is computed based on the block size and the least common multiple
of the number of process rows and columns, but it can then be enlarged to
some multiple of this value. How much it should be enlarged can now be
influenced by the min_tile_size parameter.
parent 968c9f9b
......@@ -201,10 +201,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#if COMPLEXCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp_real(:)
#endif
integer(kind=ik) :: min_tile_size, error
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
......@@ -250,7 +251,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! tile_size is thus nblk * 6
!
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
......
......@@ -174,6 +174,7 @@
logical :: successCUDA
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: min_tile_size, error
#if REALCASE == 1
logical, intent(in) :: useQR
......@@ -276,7 +277,21 @@
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
! make tile_size a smallest possible multiple of previously defined tile size, such that it is
! larger or equal to min_tile_size
! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
......
......@@ -71,6 +71,8 @@ static int band_to_full_cardinality();
static int band_to_full_enumerate(int i);
static int band_to_full_is_valid(elpa_index_t index, int n, int new_value);
static int min_tile_size_cardinality();
static int na_is_valid(elpa_index_t index, int n, int new_value);
static int nev_is_valid(elpa_index_t index, int n, int new_value);
static int bw_is_valid(elpa_index_t index, int n, int new_value);
......@@ -159,6 +161,9 @@ static const elpa_index_int_entry_t int_entries[] = {
number_of_complex_kernels, complex_kernel_enumerate, \
complex_kernel_is_valid, complex_kernel_name),
INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
min_tile_size_cardinality, NULL, NULL, NULL),
//INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL),
INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL),
//BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL),
......@@ -653,6 +658,11 @@ static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) {
abort();
}
/**
 * Cardinality callback registered for the "min_tile_size" integer option.
 *
 * Not implemented yet: when called it prints a "TODO" marker together with
 * the source file and line to stderr and then terminates the process with
 * abort(). It therefore never returns a value to the caller.
 *
 * NOTE(review): the option is registered as not tunable, so this is
 * presumably never reached during autotuning -- confirm before relying on it.
 *
 * The parameter list is spelled (void) so that this definition is a real
 * prototype; an empty list () would leave the arguments unchecked.
 */
static int min_tile_size_cardinality(void) {
        /* TODO */
        fprintf(stderr, "TODO on %s:%d\n", __FILE__, __LINE__);
        abort();
}
elpa_index_t elpa_index_instance() {
elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment