Commit 834f142b authored by Andreas Marek

Merge branch 'assign_tasks_to_gpu' into 'master_pre_stage'

Assign tasks to gpu

See merge request !55
parents 3a46cfb1 a198daba
@@ -138,7 +138,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
 endif
 if WITH_GPU_VERSION
-libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
+libelpa@SUFFIX@_private_la_SOURCES += src/GPU/elpa_index_gpu.cu src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
 endif
 if !WITH_MPI
...
@@ -24,6 +24,10 @@ gpu_flag = {
     0: "-DTEST_GPU=0",
     1: "-DTEST_GPU=1",
 }
+gpu_id_flag = {
+    0: "-DTEST_GPU_SET_ID=0",
+    1: "-DTEST_GPU_SET_ID=1",
+}
 matrix_flag = {
     "random": "-DTEST_MATRIX_RANDOM",
@@ -57,9 +61,10 @@ split_comm_flag = {
     "by_elpa": ""
 }
-for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
+for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                                                    sorted(matrix_flag.keys()),
                                                    sorted(gpu_flag.keys()),
+                                                   sorted(gpu_id_flag.keys()),
                                                    sorted(qr_flag.keys()),
                                                    sorted(test_type_flag.keys()),
                                                    sorted(prec_flag.keys()),
@@ -68,6 +73,9 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                                                    sorted(layout_flag.keys()),
                                                    sorted(split_comm_flag.keys())):
+    if gid == 1 and (g == 0):
+        continue
     if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
         continue
@@ -178,11 +186,12 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
             raise Exception("Oh no!")
         endifs += 1
-    name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
+    name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{gpuidsuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
         langsuffix=language_flag[lang],
         d=d, p=p, t=t, s=s,
         kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
         gpusuffix="gpu_" if g else "",
+        gpuidsuffix="set_gpu_id_" if gid else "",
         qrsuffix="qr_" if q else "",
         m=m,
         layoutsuffix="_all_layouts" if lay == "all_layouts" else "",
@@ -227,6 +236,7 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
                                  test_type_flag[t],
                                  solver_flag[s],
                                  gpu_flag[g],
+                                 gpu_id_flag[gid],
                                  qr_flag[q],
                                  matrix_flag[m]] + extra_flags))
...
@@ -62,7 +62,7 @@ module mod_check_for_gpu
       integer(kind=ik), intent(out)        :: numberOfDevices
       integer(kind=ik)                     :: deviceNumber, mpierr, maxNumberOfDevices
       logical                              :: gpuAvailable
-      integer(kind=ik)                     :: error, mpi_comm_all
+      integer(kind=ik)                     :: error, mpi_comm_all, use_gpu_id, min_use_gpu_id
       !character(len=1024)                 :: envname

       if (.not.(present(wantDebug))) then
@@ -83,6 +83,49 @@ module mod_check_for_gpu
         stop
       endif
+      if (obj%is_set("use_gpu_id") == 1) then
+        call obj%get("use_gpu_id", use_gpu_id, error)
+        if (use_gpu_id == -99) then
+          print *,"Problem: you did not set which GPU id this task should use"
+        endif
+
+        ! check whether a GPU id has been set on every process: an unset id
+        ! keeps the default -99, so the minimum over all tasks is negative
+        ! as soon as a single task has not set it
+#ifdef WITH_MPI
+        call mpi_allreduce(use_gpu_id, min_use_gpu_id, 1, MPI_INTEGER, MPI_MIN, mpi_comm_all, mpierr)
+
+        if (min_use_gpu_id .lt. 0) then
+          print *,"Not all tasks have set which GPU id should be used"
+          print *,"GPUs will NOT be used!"
+          gpuAvailable = .false.
+          return
+        endif
+#endif
+        gpuAvailable = .true.
+
+        if (myid == 0) then
+          if (wantDebugMessage) then
+            print *
+            print '(3(a,i0))', 'Found ', numberOfDevices, ' GPUs'
+          endif
+        endif
+
+        success = cuda_setdevice(use_gpu_id)
+        if (.not.(success)) then
+          print *,"Cannot set CudaDevice"
+          stop 1
+        endif
+        if (wantDebugMessage) then
+          print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', use_gpu_id
+        endif
+
+        success = cublas_create(cublasHandle)
+        if (.not.(success)) then
+          print *,"Cannot create cublas handle"
+          stop 1
+        endif
+
+      else
       if (cublasHandle .ne. -1) then
         gpuAvailable = .true.
         numberOfDevices = -1
@@ -146,5 +189,6 @@ module mod_check_for_gpu
         endif
       endif
+      endif

   end function
 end module
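The consistency check added above works because an unset "use_gpu_id" keeps its default of -99: the minimum over all MPI tasks is negative as soon as a single task has not set an id. A minimal stand-alone sketch of the same pattern in C (the helper name all_ranks_have_gpu_id is illustrative, not part of ELPA):

#include <stdio.h>
#include <mpi.h>

/* Returns 1 if every rank in 'comm' has set a non-negative GPU id,
 * 0 otherwise; -99 marks "not set", matching the ELPA default above. */
static int all_ranks_have_gpu_id(int my_gpu_id, MPI_Comm comm)
{
        int min_id;
        MPI_Allreduce(&my_gpu_id, &min_id, 1, MPI_INT, MPI_MIN, comm);
        if (min_id < 0) {
                fprintf(stderr, "Not all tasks have set which GPU id should be used\n");
                return 0;
        }
        return 1;
}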
extern "C" {
int gpu_count() {
int count;
cudaError_t cuerr = cudaGetDeviceCount(&count);
if (cuerr != cudaSuccess) {
count = -1000;
}
return count;
}
}
@@ -143,6 +143,16 @@
 !>   print *,"Could not setup ELPA object"
 !>   endif
 !>
+!>   ! settings for GPU
+!>   call elpaInstance%set("gpu", 1, success)   ! 1=on, 0=off
+!>   ! In case of GPU usage you have the choice whether ELPA should
+!>   ! automatically assign each MPI task to a certain GPU (this is the
+!>   ! default) or whether you want to set this assignment for each
+!>   ! task yourself.
+!>   ! Set the assignment yourself (here only one task is used and it
+!>   ! is assigned to GPU id 1):
+!>   if (my_rank .eq. 0) call elpaInstance%set("use_gpu_id", 1, success)
+!>
 !>   ! if desired, set tunable run-time options
 !>   ! here we want to use the 2-stage solver
 !>   call elpaInstance%set("solver", ELPA_SOLVER_2STAGE, success)
@@ -206,6 +216,16 @@
 !>   /* here we want to use the 2-stage solver */
 !>   elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error);
 !>
+!>   /* settings for GPU */
+!>   elpa_set(handle, "gpu", 1, &error);   /* 1=on, 0=off */
+!>   /* In case of GPU usage you have the choice whether ELPA should
+!>      automatically assign each MPI task to a certain GPU (this is the
+!>      default) or whether you want to set this assignment for each
+!>      task yourself.
+!>      Set the assignment yourself (here only one task is used and it
+!>      is assigned to GPU id 1): */
+!>   if (my_rank == 0) elpa_set(handle, "use_gpu_id", 1, &error);
+!>
 !>   elpa_set(handle,"real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error);
 !> \endcode
 !>   ... set and get all other options that are desired
...
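The documentation excerpt above only assigns a GPU id on rank 0. For a full per-task assignment, a common choice is a round-robin mapping of node-local ranks onto the devices visible on that node. A sketch against the C API shown above; the function name assign_gpu_to_rank and the node-local communicator are assumptions of this example, not part of ELPA:

#include <mpi.h>
#include <cuda_runtime.h>
#include <elpa/elpa.h>

/* Map the calling task to one of the GPUs on its node and tell ELPA
 * about it via "use_gpu_id".  The ELPA handle is assumed to have been
 * created and configured with elpa_set(handle, "gpu", 1, &error) as in
 * the excerpt above; node_comm is a node-local communicator, e.g. from
 * MPI_Comm_split_type(..., MPI_COMM_TYPE_SHARED, ...). */
void assign_gpu_to_rank(elpa_t handle, MPI_Comm node_comm)
{
        int node_rank, device_count, error;

        MPI_Comm_rank(node_comm, &node_rank);

        if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count < 1) {
                return;   /* no usable GPU: leave "use_gpu_id" unset */
        }

        /* round-robin: node-local rank i uses GPU i % device_count */
        elpa_set(handle, "use_gpu_id", node_rank % device_count, &error);
}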
@@ -112,6 +112,14 @@ static int min_tile_size_cardinality(elpa_index_t index);
 static int min_tile_size_enumerate(elpa_index_t index, int i);
 static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value);
+#ifdef WITH_GPU_VERSION
+int gpu_count();
+#endif
+
+static int use_gpu_id_cardinality(elpa_index_t index);
+static int use_gpu_id_enumerate(elpa_index_t index, int i);
+static int use_gpu_id_is_valid(elpa_index_t index, int n, int new_value);
+
 static int valid_with_gpu(elpa_index_t index, int n, int new_value);
 static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value);
 static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value);
@@ -236,6 +244,8 @@ static const elpa_index_int_entry_t int_entries[] = {
         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
 INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
+INT_ENTRY("use_gpu_id", "The calling MPI task will use this GPU id", -99, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \
+        use_gpu_id_cardinality, use_gpu_id_enumerate, use_gpu_id_is_valid, NULL, PRINT_YES),
 INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
         number_of_real_kernels, real_kernel_enumerate, real_kernel_is_valid, real_kernel_name, PRINT_YES),
 INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
@@ -1093,6 +1103,40 @@ static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value) {
         }
 }
+
+static int use_gpu_id_cardinality(elpa_index_t index) {
+#ifdef WITH_GPU_VERSION
+        int count;
+        count = gpu_count();
+        if (count == -1000) {
+                fprintf(stderr, "Querying GPUs failed! Set GPU count = 0\n");
+                return 0;
+        }
+        return count;
+#else
+        return 0;
+#endif
+}
+
+static int use_gpu_id_enumerate(elpa_index_t index, int i) {
+        fprintf(stderr, "use_gpu_id_enumerate should never be called. Please report this bug\n");
+        return i;
+}
+
+static int use_gpu_id_is_valid(elpa_index_t index, int n, int new_value) {
+#ifdef WITH_GPU_VERSION
+        int count;
+        count = gpu_count();
+        if (count == -1000) {
+                fprintf(stderr, "Querying GPUs failed! Return with error\n");
+                return 0 == 1;
+        } else {
+                /* valid CUDA device ids are 0 .. count-1 */
+                return (0 <= new_value) && (new_value < count);
+        }
+#else
+        return 0 == 0;
+#endif
+}
+
 // TODO: this should definitely be improved (too many options to test in autotuning)
 static const int TILE_SIZE_STEP = 128;
...
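For reference, the validity check above accepts exactly the device ids 0 .. gpu_count()-1, and a caller can apply the same rule before handing an id to elpa_set. A small sketch; check_requested_gpu_id is a hypothetical helper, and cudaGetDeviceCount plays the role of gpu_count() here:

#include <stdio.h>
#include <cuda_runtime.h>

/* Hypothetical helper mirroring use_gpu_id_is_valid(): returns 1 if 'id'
 * names an existing CUDA device, 0 otherwise. */
static int check_requested_gpu_id(int id)
{
        int count;
        if (cudaGetDeviceCount(&count) != cudaSuccess) {
                fprintf(stderr, "Querying GPUs failed\n");
                return 0;
        }
        return (0 <= id) && (id < count);   /* valid device ids: 0 .. count-1 */
}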
@@ -199,7 +199,9 @@ program test
 #ifdef WITH_OPENMP_TRADITIONAL
    TEST_INT_TYPE :: max_threads, threads_caller
 #endif
+#ifdef TEST_GPU_SET_ID
+   TEST_INT_TYPE :: gpuID
+#endif
 #ifdef SPLIT_COMM_MYSELF
    TEST_INT_MPI_TYPE :: mpi_comm_rows, mpi_comm_cols, mpi_string_length, mpierr2
    character(len=MPI_MAX_ERROR_STRING) :: mpierr_string
@@ -641,6 +643,16 @@ program test
      call e%set("gpu", TEST_GPU, error_elpa)
      assert_elpa_ok(error_elpa)
+#ifdef TEST_GPU_SET_ID
+     ! simple test: assign GPU ids round-robin over two devices;
+     ! this can (and is expected to) fail when fewer than two GPUs are present
+     gpuID = mod(myid,2)
+     print *,"Task",myid,"wants to use GPU",gpuID
+     call e%set("use_gpu_id", int(gpuID,kind=c_int), error_elpa)
+     assert_elpa_ok(error_elpa)
+#endif
 #if TEST_QR_DECOMPOSITION == 1
      call e%set("qr", 1_ik, error_elpa)
      assert_elpa_ok(error_elpa)
...