Commit 834f142b authored by Andreas Marek's avatar Andreas Marek

Merge branch 'assign_tasks_to_gpu' into 'master_pre_stage'

Assign tasks to gpu

See merge request !55
parents 3a46cfb1 a198daba
......@@ -138,7 +138,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/elpa_index_gpu.cu src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
endif
if !WITH_MPI
......
......@@ -24,6 +24,10 @@ gpu_flag = {
0: "-DTEST_GPU=0",
1: "-DTEST_GPU=1",
}
gpu_id_flag = {
0: "-DTEST_GPU_SET_ID=0",
1: "-DTEST_GPU_SET_ID=1",
}
matrix_flag = {
"random": "-DTEST_MATRIX_RANDOM",
......@@ -57,9 +61,10 @@ split_comm_flag = {
"by_elpa": ""
}
for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
sorted(matrix_flag.keys()),
sorted(gpu_flag.keys()),
sorted(gpu_id_flag.keys()),
sorted(qr_flag.keys()),
sorted(test_type_flag.keys()),
sorted(prec_flag.keys()),
......@@ -68,6 +73,9 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
sorted(layout_flag.keys()),
sorted(split_comm_flag.keys())):
if gid == 1 and (g == 0 ):
continue
if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
continue
......@@ -178,11 +186,12 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
raise Exception("Oh no!")
endifs += 1
name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{gpuidsuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if g else "",
gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "",
m=m,
layoutsuffix="_all_layouts" if lay == "all_layouts" else "",
......@@ -227,6 +236,7 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
test_type_flag[t],
solver_flag[s],
gpu_flag[g],
gpu_id_flag[gid],
qr_flag[q],
matrix_flag[m]] + extra_flags))
......
......@@ -62,7 +62,7 @@ module mod_check_for_gpu
integer(kind=ik), intent(out) :: numberOfDevices
integer(kind=ik) :: deviceNumber, mpierr, maxNumberOfDevices
logical :: gpuAvailable
integer(kind=ik) :: error, mpi_comm_all
integer(kind=ik) :: error, mpi_comm_all, use_gpu_id, min_use_gpu_id
!character(len=1024) :: envname
if (.not.(present(wantDebug))) then
......@@ -83,6 +83,49 @@ module mod_check_for_gpu
stop
endif
if (obj%is_set("use_gpu_id") == 1) then
call obj%get("use_gpu_id", use_gpu_id, error)
if (use_gpu_id == -99) then
print *,"Problem you did not set which gpu id this task should use"
endif
! check whether the GPU id has been set for each process
#ifdef WITH_MPI
call mpi_allreduce(use_gpu_id, min_use_gpu_id, 1, MPI_INTEGER, MPI_MIN, mpi_comm_all, mpierr)
if (min_use_gpu_id .lt. 0) then
print *,"Not all tasks have set which GPU id should be used"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
#endif
gpuAvailable = .true.
if (myid==0) then
if (wantDebugMessage) then
print *
print '(3(a,i0))','Found ', numberOfDevices, ' GPUs'
endif
endif
success = cuda_setdevice(use_gpu_id)
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', use_gpu_id
endif
success = cublas_create(cublasHandle)
if (.not.(success)) then
print *,"Cannot create cublas handle"
stop 1
endif
else
if (cublasHandle .ne. -1) then
gpuAvailable = .true.
numberOfDevices = -1
......@@ -146,5 +189,6 @@ module mod_check_for_gpu
endif
endif
end function
end module
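The consistency check above reduces every rank's use_gpu_id and treats a negative result as "not set". As a minimal hedged sketch of the same idea in C (the function name and the use of MPI_COMM_WORLD are illustrative, not taken from this merge request), reducing with MPI_MIN lets a single rank that still carries the -99 default disable GPU usage for everyone:

/* Sketch only: check that every MPI rank has assigned itself a GPU id.
   The -99 sentinel mirrors the default value of the "use_gpu_id" entry. */
#include <mpi.h>

static int all_ranks_set_gpu_id(int use_gpu_id, MPI_Comm comm)
{
  int min_id;
  /* MPI_MIN: if any rank still carries the -99 default, min_id becomes negative */
  MPI_Allreduce(&use_gpu_id, &min_id, 1, MPI_INT, MPI_MIN, comm);
  return min_id >= 0;
}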
extern "C" {
int gpu_count() {
int count;
cudaError_t cuerr = cudaGetDeviceCount(&count);
if (cuerr != cudaSuccess) {
count = -1000;
}
return count;
}
}
......@@ -143,6 +143,16 @@
!> print *,"Could not setup ELPA object"
!> endif
!>
!> ! settings for GPU
!> call elpaInstance%set("gpu", 1, success) ! 1=on, 2=off
!> ! in case of GPU usage you have the choice whether ELPA
!> ! should automatically assign each MPI task to a certain GPU
!> ! (this is the default) or whether you want to set this assignment
!> ! for _each_ task yourself
!> ! set the assignment yourself (only using one task here and assigning it
!> ! to GPU id 1)
!> if (my_rank .eq. 0) call elpaInstance%set("use_gpu_id", 1, success)
!>
!> ! if desired, set tunable run-time options
!> ! here we want to use the 2-stage solver
!> call elpaInstance%set("solver", ELPA_SOLVER_2STAGE, success)
......@@ -206,6 +216,16 @@
!> /* here we want to use the 2-stage solver */
!> elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error);
!>
!> /* settings for GPU */
!> elpa_set(handle, "gpu", 1, &error); /* 1=on, 2=off */
!> /* in case of GPU usage you have the choice whether ELPA
!> should automatically assign each MPI task to a certain GPU
!> (this is the default) or whether you want to set this assignment
!> for _each_ task yourself
!> set the assignment yourself (only using one task here and assigning it
!> to GPU id 1) */
!> if (my_rank == 0) elpa_set(handle, "use_gpu_id", 1, &error);
!>
!> elpa_set(handle,"real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error);
!> \endcode
!> ... set and get all other options that are desired
......
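To make the choice between automatic and explicit assignment concrete, the following is a minimal hedged C sketch (not part of this merge request) that assigns each MPI rank a GPU id round-robin over the locally visible devices and passes it through the new "use_gpu_id" option; mapping over the world rank is an assumption, a production code would normally use a node-local rank:

/* Sketch only: explicit per-rank GPU assignment via the new "use_gpu_id" option.
   Assumes an already allocated elpa_t handle; the round-robin mapping over the
   world rank is illustrative. */
#include <mpi.h>
#include <cuda_runtime.h>
#include <elpa/elpa.h>

static int assign_gpu_round_robin(elpa_t handle)
{
  int rank, count, error;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (cudaGetDeviceCount(&count) != cudaSuccess || count < 1)
    return -1;                                   /* no usable GPU on this node */
  elpa_set(handle, "gpu", 1, &error);            /* enable GPU usage: 1=on, 0=off */
  if (error != ELPA_OK)
    return error;
  elpa_set(handle, "use_gpu_id", rank % count, &error);  /* assumed mapping */
  return error;
}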
......@@ -112,6 +112,14 @@ static int min_tile_size_cardinality(elpa_index_t index);
static int min_tile_size_enumerate(elpa_index_t index, int i);
static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value);
#ifdef WITH_GPU_VERSION
int gpu_count();
#endif
static int use_gpu_id_cardinality(elpa_index_t index);
static int use_gpu_id_enumerate(elpa_index_t index, int i);
static int use_gpu_id_is_valid(elpa_index_t index, int n, int new_value);
static int valid_with_gpu(elpa_index_t index, int n, int new_value);
static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value);
static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value);
......@@ -236,6 +244,8 @@ static const elpa_index_int_entry_t int_entries[] = {
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("use_gpu_id", "Calling MPI task will use this gpu id", -99, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \
use_gpu_id_cardinality, use_gpu_id_enumerate, use_gpu_id_is_valid, NULL, PRINT_YES),
INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
number_of_real_kernels, real_kernel_enumerate, real_kernel_is_valid, real_kernel_name, PRINT_YES),
INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
......@@ -1093,6 +1103,40 @@ static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value) {
}
}
static int use_gpu_id_cardinality(elpa_index_t index) {
#ifdef WITH_GPU_VERSION
int count;
count = gpu_count();
if (count == -1000) {
fprintf(stderr, "Querrying GPUs failed! Set GPU count = 0\n");
return 0;
}
return count;
#else
return 0;
#endif
}
static int use_gpu_id_enumerate(elpa_index_t index, int i) {
fprintf(stderr, "use_gpu_id_is_enumerate should never be called. please report this bug\n");
return i;
}
static int use_gpu_id_is_valid(elpa_index_t index, int n, int new_value) {
#ifdef WITH_GPU_VERSION
int count;
count = gpu_count();
if (count == -1000) {
fprintf(stderr, "Querrying GPUs failed! Return with error\n");
return 0 == 1 ;
} else {
return (0 <= new_value) && (new_value <= count);
}
#else
return 0 == 0;
#endif
}
// TODO: this should definitely be improved (too many options to test in autotuning)
static const int TILE_SIZE_STEP = 128;
......
......@@ -199,7 +199,9 @@ program test
#ifdef WITH_OPENMP_TRADITIONAL
TEST_INT_TYPE :: max_threads, threads_caller
#endif
#ifdef TEST_GPU_SET_ID
TEST_INT_TYPE :: gpuID
#endif
#ifdef SPLIT_COMM_MYSELF
TEST_INT_MPI_TYPE :: mpi_comm_rows, mpi_comm_cols, mpi_string_length, mpierr2
character(len=MPI_MAX_ERROR_STRING) :: mpierr_string
......@@ -641,6 +643,16 @@ program test
call e%set("gpu", TEST_GPU, error_elpa)
assert_elpa_ok(error_elpa)
#ifdef TEST_GPU_SET_ID
! simple test: assign tasks round-robin to GPU ids 0 and 1
! can (and is expected to) fail if fewer than two GPUs are visible
gpuID = mod(myid,2)
print *,"Task",myid,"wants to use GPU",gpuID
call e%set("use_gpu_id", int(gpuID,kind=c_int), error_elpa)
assert_elpa_ok(error_elpa)
#endif
#if TEST_QR_DECOMPOSITION == 1
call e%set("qr", 1_ik, error_elpa)
assert_elpa_ok(error_elpa)
......