From 0183d8caf705b84ddf4b178ea358b1ebcf80c160 Mon Sep 17 00:00:00 2001 From: Pavel Kus Date: Thu, 26 Apr 2018 11:56:35 +0200 Subject: [PATCH] tile size can be influenced by the user tile size is used in elpa1_tridiag and elpa2_bandred. Its value is computed based on the block size and the least common multiple of the number of process rows and columns, but it can then be enlarged to some multiple of this value. How much it should be enlarged can now be influenced by the min_tile_size parameter. --- src/elpa1/elpa1_tridiag_template.F90 | 19 +++++++++++++++++-- src/elpa2/elpa2_bandred_template.F90 | 17 ++++++++++++++++- src/elpa_index.c | 10 ++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/elpa1/elpa1_tridiag_template.F90 b/src/elpa1/elpa1_tridiag_template.F90 index c3e318c8..c0c3d117 100644 --- a/src/elpa1/elpa1_tridiag_template.F90 +++ b/src/elpa1/elpa1_tridiag_template.F90 @@ -201,10 +201,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_ #if COMPLEXCASE == 1 real(kind=REAL_DATATYPE), allocatable :: tmp_real(:) #endif + integer(kind=ik) :: min_tile_size, error integer(kind=ik) :: istat character(200) :: errorMessage character(20) :: gpuString - integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& &PRECISION& &_& &MATH_DATATYPE @@ -250,7 +251,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_ ! tile_size is thus nblk * 6 ! tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide + + ! make tile_size the smallest possible multiple of the previously defined tile size, such that it is + ! greater than or equal to min_tile_size + ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so it is now the default value + ! 
it can, however, be set by the user + call obj%get("min_tile_size", min_tile_size ,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if(min_tile_size == 0) then + ! not set by the user, use the default value + min_tile_size = 128*max(np_rows, np_cols) + endif + tile_size = ((min_tile_size-1)/tile_size+1)*tile_size l_rows_per_tile = tile_size/np_rows ! local rows of a tile l_cols_per_tile = tile_size/np_cols ! local cols of a tile diff --git a/src/elpa2/elpa2_bandred_template.F90 b/src/elpa2/elpa2_bandred_template.F90 index 9aed2c62..9aa2edd7 100644 --- a/src/elpa2/elpa2_bandred_template.F90 +++ b/src/elpa2/elpa2_bandred_template.F90 @@ -174,6 +174,7 @@ logical :: successCUDA integer(kind=ik) :: istat character(200) :: errorMessage + integer(kind=ik) :: min_tile_size, error #if REALCASE == 1 logical, intent(in) :: useQR @@ -276,7 +277,21 @@ ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide + + ! make tile_size the smallest possible multiple of the previously defined tile size, such that it is + ! greater than or equal to min_tile_size + ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so it is now the default value + ! it can, however, be set by the user + call obj%get("min_tile_size", min_tile_size ,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if(min_tile_size == 0) then + ! not set by the user, use the default value + min_tile_size = 128*max(np_rows, np_cols) + endif + tile_size = ((min_tile_size-1)/tile_size+1)*tile_size l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! 
local cols of a tile diff --git a/src/elpa_index.c b/src/elpa_index.c index d679747f..cb432bbe 100644 --- a/src/elpa_index.c +++ b/src/elpa_index.c @@ -71,6 +71,8 @@ static int band_to_full_cardinality(); static int band_to_full_enumerate(int i); static int band_to_full_is_valid(elpa_index_t index, int n, int new_value); +static int min_tile_size_cardinality(); + static int na_is_valid(elpa_index_t index, int n, int new_value); static int nev_is_valid(elpa_index_t index, int n, int new_value); static int bw_is_valid(elpa_index_t index, int n, int new_value); @@ -159,6 +161,9 @@ static const elpa_index_int_entry_t int_entries[] = { number_of_complex_kernels, complex_kernel_enumerate, \ complex_kernel_is_valid, complex_kernel_name), + INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, + min_tile_size_cardinality, NULL, NULL, NULL), + //INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL), INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL), //BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL), @@ -653,6 +658,11 @@ static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) { abort(); } +static int min_tile_size_cardinality() { + /* TODO */ + fprintf(stderr, "TODO on %s:%d\n", __FILE__, __LINE__); + abort(); +} elpa_index_t elpa_index_instance() { elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct)); -- GitLab