Add a user-settable "min_tile_size" option for elpa1_tridiag and elpa2_bandred.

The lower bound used when rounding up tile_size was hard-coded as
128*max(np_rows,np_cols). It is now read from the "min_tile_size" index
entry (default 0); a non-positive value selects the former hard-coded bound.

diff --git a/src/elpa1/elpa1_tridiag_template.F90 b/src/elpa1/elpa1_tridiag_template.F90
index c3e318c8fd07de167d058af4f47b477d975db707..c0c3d1173355de27c733fe7ff31b5be8a9ab6408 100644
--- a/src/elpa1/elpa1_tridiag_template.F90
+++ b/src/elpa1/elpa1_tridiag_template.F90
@@ -201,10 +201,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
 #if COMPLEXCASE == 1
       real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
 #endif
+      integer(kind=ik)                              :: min_tile_size, error
       integer(kind=ik)                              :: istat
       character(200)                                :: errorMessage
       character(20)                                 :: gpuString
-      integer(kind=c_intptr_t), parameter          :: size_of_datatype = size_of_&
+      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                           &PRECISION&
                                                                           &_&
                                                                           &MATH_DATATYPE
@@ -250,7 +251,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       ! tile_size is thus nblk * 6
       !
       tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
-      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
+
+      ! round tile_size up to the smallest multiple of the minimum global tile size computed above
+      ! that is greater than or equal to min_tile_size
+      ! min_tile_size was originally hard-coded as 128 * max(np_rows, np_cols); that is still the default
+      ! it can, however, be set by the user via the "min_tile_size" option
+      call obj%get("min_tile_size", min_tile_size ,error)
+      if (error .ne. ELPA_OK) then
+        print *,"Problem getting option min_tile_size. Aborting..."
+        stop 1
+      endif
+      if(min_tile_size <= 0) then
+        ! not set by the user (0) or not usable (negative): use the default value
+        min_tile_size = 128*max(np_rows, np_cols)
+      endif
+      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
 
       l_rows_per_tile = tile_size/np_rows ! local rows of a tile
       l_cols_per_tile = tile_size/np_cols ! local cols of a tile
diff --git a/src/elpa2/elpa2_bandred_template.F90 b/src/elpa2/elpa2_bandred_template.F90
index 9aed2c62cb7f17165849de4bec49e415596123a2..9aa2edd7028c4ee28af0468f81c4e69ca6819eb3 100644
--- a/src/elpa2/elpa2_bandred_template.F90
+++ b/src/elpa2/elpa2_bandred_template.F90
@@ -174,6 +174,7 @@
       logical                                     :: successCUDA
       integer(kind=ik)                            :: istat
       character(200)                              :: errorMessage
+      integer(kind=ik)                            :: min_tile_size, error
 
 #if REALCASE == 1
       logical, intent(in)                         :: useQR
@@ -276,7 +277,21 @@
       ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
 
       tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
-      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
+
+      ! round tile_size up to the smallest multiple of the minimum global tile size computed above
+      ! that is greater than or equal to min_tile_size
+      ! min_tile_size was originally hard-coded as 128 * max(np_rows, np_cols); that is still the default
+      ! it can, however, be set by the user via the "min_tile_size" option
+      call obj%get("min_tile_size", min_tile_size ,error)
+      if (error .ne. ELPA_OK) then
+        print *,"Problem getting option min_tile_size. Aborting..."
+        stop 1
+      endif
+      if(min_tile_size <= 0) then
+        ! not set by the user (0) or not usable (negative): use the default value
+        min_tile_size = 128*max(np_rows, np_cols)
+      endif
+      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
 
       l_rows_tile = tile_size/np_rows ! local rows of a tile
       l_cols_tile = tile_size/np_cols ! local cols of a tile
diff --git a/src/elpa_index.c b/src/elpa_index.c
index d679747f5aca28314950fd4f1a2270a66040561e..cb432bbebe6ec85814989da100fb6716856179dc 100644
--- a/src/elpa_index.c
+++ b/src/elpa_index.c
@@ -71,6 +71,8 @@ static int band_to_full_cardinality();
 static int band_to_full_enumerate(int i);
 static int band_to_full_is_valid(elpa_index_t index, int n, int new_value);
 
+static int min_tile_size_cardinality();
+
 static int na_is_valid(elpa_index_t index, int n, int new_value);
 static int nev_is_valid(elpa_index_t index, int n, int new_value);
 static int bw_is_valid(elpa_index_t index, int n, int new_value);
@@ -159,6 +161,9 @@ static const elpa_index_int_entry_t int_entries[] = {
                         number_of_complex_kernels, complex_kernel_enumerate, \
                         complex_kernel_is_valid, complex_kernel_name),
 
+        INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
+                        min_tile_size_cardinality, NULL, NULL, NULL),
+
         //INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL),
         INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL),
         //BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL),
@@ -653,6 +658,11 @@ static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) {
         abort();
 }
 
+static int min_tile_size_cardinality() {
+        /* TODO: cardinality is not implemented for min_tile_size (registered as ELPA_AUTOTUNE_NOT_TUNABLE); calling this reports the source location and aborts */
+        fprintf(stderr, "TODO on %s:%d\n", __FILE__, __LINE__);
+        abort();
+}
 
 elpa_index_t elpa_index_instance() {
         elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct));