Commit ec5b3bec authored by Pavel Kus's avatar Pavel Kus
Browse files

partially addressing issues with the GPU kernel

This commit addresses several issues. It essentially forbids the use of
the GPU kernel, which become obsolete and caused problems. But it
does not complete remove the related code, nor does it forbid from
explicitly selecting the GPU kernel. However, if the user does select
it, the warning will be issued and the GENERIC kernel would be used
instead. In the more details:
* Commentin out operations in the GPU kernel, which do not compile with
  CUDA 10.1. This makes the kernel deffinitely not ussable (but it was
  true even before)
* removing the gpu_tridiag_band option, sincie the tridiag->banded routine
  is actually not ported to GPU at all. This step will thus always be
  run on the CPU
* removing the gpu_trans_ev_tridi_to_band option, since the GPU version
  of this step cannot run without the GPU kernel and it is not usable.
  This step will thus also be performed on the CPU
* modifying REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE and
  COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE such that the GPU kernel is
  not considered during the autotuning

* TODO however, the GPU kernel can still be enforced by the user. In
  this case, during the calculation, a warning is issued and the kernel
  is switched to the GENERIC one. This should be improved and there
  should not even be the possibility to choose the GPU kernel at the
  begining.
parent 67a41d2e
......@@ -50,6 +50,7 @@
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuComplex.h>
......@@ -95,10 +96,16 @@ static __device__ __forceinline__ double shfl_down_complex_double(double r, int
static __device__ __forceinline__ float shfl_down_complex_single(float r, int offset)
#endif
{
int hi = __shfl_down(__double2hiint(r), offset);
int lo = __shfl_down(__double2loint(r), offset);
// The following operations do not exist in CUDA 10.1 any more
// It has been commented out. The code is still compiled, but not used
// TODO do it properly
return __hiloint2double(hi, lo);
assert(0);
//int hi = __shfl_down(__double2hiint(r), offset);
//int lo = __shfl_down(__double2loint(r), offset);
//return __hiloint2double(hi, lo);
return 0.;
}
#ifdef DOUBLE_PRECISION_COMPLEX
......
......@@ -50,6 +50,7 @@
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include "config-f90.h"
......@@ -64,10 +65,16 @@ static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask
static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask)
#endif
{
int hi = __shfl_xor(__double2hiint(r), mask);
int lo = __shfl_xor(__double2loint(r), mask);
// The following operations do not exist in CUDA 10.1 any more
// It has been commented out. The code is still compiled, but not used
// TODO do it properly
assert(0);
return __hiloint2double(hi, lo);
// int hi = __shfl_xor(__double2hiint(r), mask);
// int lo = __shfl_xor(__double2loint(r), mask);
//
// return __hiloint2double(hi, lo);
return 0.;
}
// Perform the equivalent of "__shfl_down" on an 8-byte value
......@@ -77,10 +84,16 @@ static __device__ __forceinline__ double shfl_down_real_double(double r, int off
static __device__ __forceinline__ float shfl_down_real_single(float r, int offset)
#endif
{
int hi = __shfl_down(__double2hiint(r), offset);
int lo = __shfl_down(__double2loint(r), offset);
// The following operations do not exist in CUDA 10.1 any more
// It has been commented out. The code is still compiled, but not used
// TODO do it properly
assert(0);
return __hiloint2double(hi, lo);
// int hi = __shfl_down(__double2hiint(r), offset);
// int lo = __shfl_down(__double2loint(r), offset);
//
// return __hiloint2double(hi, lo);
return 0.;
}
// Perform a reduction on a warp or the first part of it
......
......@@ -282,9 +282,11 @@
endif
do_useGPU_bandred = do_useGPU
do_useGPU_tridiag_band = do_useGPU
! tridiag-band not ported to GPU yet
do_useGPU_tridiag_band = .false.
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev_tridi_to_band = do_useGPU
! trans tridi to band GPU implementation does not work properly
do_useGPU_trans_ev_tridi_to_band = .false.
do_useGPU_trans_ev_band_to_full = do_useGPU
! only if we want (and can) use GPU in general, look what are the
......@@ -299,12 +301,14 @@
endif
do_useGPU_bandred = (gpu == 1)
call obj%get("gpu_tridiag_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_tridiag_band = (gpu == 1)
! call obj%get("gpu_tridiag_band", gpu, error)
! if (error .ne. ELPA_OK) then
! print *,"Problem getting option. Aborting..."
! stop
! endif
! do_useGPU_tridiag_band = (gpu == 1)
! tridiag-band not ported to GPU yet
do_useGPU_tridiag_band = .false.
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
......@@ -313,12 +317,13 @@
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_trans_ev_tridi_to_band = (gpu == 1)
! call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
! if (error .ne. ELPA_OK) then
! print *,"Problem getting option. Aborting..."
! stop
! endif
! do_useGPU_trans_ev_tridi_to_band = (gpu == 1)
do_useGPU_trans_ev_tridi_to_band = .false.
call obj%get("gpu_trans_ev_band_to_full", gpu, error)
if (error .ne. ELPA_OK) then
......@@ -329,16 +334,28 @@
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
!!! this currently cannot happen, GPU_trans_ev_tridi_to_band is always false
write(error_unit,*) "ELPA: internal error!"
stop
! if (kernel .ne. GPU_KERNEL) then
! write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
! write(error_unit,*) "The compute kernel will be executed on CPUs!"
! do_useGPU_trans_ev_tridi_to_band = .false.
! else if (nblk .ne. 128) then
! write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
! write(error_unit,*) "The compute kernel will be executed on CPUs!"
! do_useGPU_trans_ev_tridi_to_band = .false.
! kernel = GENERIC_KERNEL
! endif
else
if (kernel .eq. GPU_KERNEL) then
! We have currently forbidden to use GPU version of trans ev tridi to band, but we did not forbid the possibility
! to select the GPU kernel. If done such, give warning and swicht to the generic kernel
! TODO it would be better to forbid the possibility to set the GPU kernel completely
write(error_unit,*) "ELPA: ERROR, GPU kernel currently not implemented.&
& Use optimized CPU kernel even for GPU runs! &
Switching to the non-optimized generic kernel"
kernel = GENERIC_KERNEL
endif
endif
......@@ -346,16 +363,19 @@
! check again, now kernel and do_useGPU_trans_ev_tridi_to_band sould be
! finally consistent
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
endif
!!! this currently cannot happen, GPU_trans_ev_tridi_to_band is always false
write(error_unit,*) "ELPA: internal error!"
stop
! if (kernel .ne. GPU_KERNEL) then
! ! this should never happen, checking as an assert
! write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
! stop
! endif
! if (nblk .ne. 128) then
! ! this should never happen, checking as an assert
! write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
! stop
! endif
else
if (kernel .eq. GPU_KERNEL) then
! combination not allowed
......
......@@ -206,10 +206,12 @@ static const elpa_index_int_entry_t int_entries[] = {
cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
// the routine has not been ported to GPU yet
// INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
// the GPU implementation of this routine (together with the kernel) has been abandoned
// INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \
cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
......@@ -643,7 +645,9 @@ static const char *real_kernel_name(int kernel) {
}
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1
// currently the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......@@ -682,7 +686,9 @@ static const char *complex_kernel_name(int kernel) {
}
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1
// currenttly the GPU kernel is never valid
// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment