From c4d1dff7816f11c82b588b683a893dce27ff2f5d Mon Sep 17 00:00:00 2001
From: Luka Stanisic
Date: Fri, 16 Jun 2017 10:33:56 +0200
Subject: [PATCH] checking CUDA driver errors

---
 bioem_cuda.cu                 |  8 ++--
 include/bioem_cuda_internal.h | 91 +++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index aa70fb7..26535b3 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -297,13 +297,13 @@ int bioem_cuda::selectCudaDevice()
 #else
 		unsigned int free, total;
 #endif
-		cuInit(0);
+		CU_ERROR_CHECK(cuInit(0));
 		CUdevice tmpDevice;
-		cuDeviceGet(&tmpDevice, i);
+		CU_ERROR_CHECK(cuDeviceGet(&tmpDevice, i));
 		CUcontext tmpContext;
-		cuCtxCreate(&tmpContext, 0, tmpDevice);
+		CU_ERROR_CHECK(cuCtxCreate(&tmpContext, 0, tmpDevice));
 		if(cuMemGetInfo(&free, &total)) exit(1);
-		cuCtxDestroy(tmpContext);
+		CU_ERROR_CHECK(cuCtxDestroy(tmpContext));
 		checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
 		if (DebugOutput >= 2 && mpi_rank == 0) printf("CUDA Device %2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)\n", i, deviceProp.name, deviceProp.major, deviceProp.minor, (long long int) free, (long long int) deviceProp.totalGlobalMem);
 
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 4d44c5e..60939da 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -72,5 +72,96 @@ private:
 	int maxRef;
 };
 
+/* Handling CUDA Driver errors */
+/* Inspired from: https://github.com/garymacindoe/cuda-cholesky */
+
+// Expand and stringify argument
+#define STRINGx(x) #x
+#define STRING(x) STRINGx(x)
+
+/**
+ * Translate a CUDA driver API result code into a short description.
+ *
+ * Declared inline: the definition lives in a header, so without it every
+ * translation unit including this file would emit its own external definition.
+ *
+ * @param result  Status code returned by a CUDA driver API call.
+ * @return Pointer to a string literal describing the code; "Unknown error
+ *         code" for any value not listed in the switch.
+ */
+inline const char * cuGetError(CUresult result) {
+  switch (result) {
+    case CUDA_SUCCESS:                              return "No errors";
+    case CUDA_ERROR_INVALID_VALUE:                  return "Invalid value";
+    case CUDA_ERROR_OUT_OF_MEMORY:                  return "Out of memory";
+    case CUDA_ERROR_NOT_INITIALIZED:                return "Driver not initialized";
+    case CUDA_ERROR_DEINITIALIZED:                  return "Driver deinitialized";
+    case CUDA_ERROR_PROFILER_DISABLED:              return "Profiler disabled";
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:       return "Profiler not initialized";
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED:       return "Profiler already started";
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:       return "Profiler already stopped";
+    case CUDA_ERROR_NO_DEVICE:                      return "No CUDA-capable device available";
+    case CUDA_ERROR_INVALID_DEVICE:                 return "Invalid device";
+    case CUDA_ERROR_INVALID_IMAGE:                  return "Invalid kernel image";
+    case CUDA_ERROR_INVALID_CONTEXT:                return "Invalid context";
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:        return "Context already current";
+    case CUDA_ERROR_MAP_FAILED:                     return "Map failed";
+    case CUDA_ERROR_UNMAP_FAILED:                   return "Unmap failed";
+    case CUDA_ERROR_ARRAY_IS_MAPPED:                return "Array is mapped";
+    case CUDA_ERROR_ALREADY_MAPPED:                 return "Already mapped";
+    case CUDA_ERROR_NO_BINARY_FOR_GPU:              return "No binary for GPU";
+    case CUDA_ERROR_ALREADY_ACQUIRED:               return "Already acquired";
+    case CUDA_ERROR_NOT_MAPPED:                     return "Not mapped";
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:            return "Not mapped as array";
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:          return "Not mapped as pointer";
+    case CUDA_ERROR_ECC_UNCORRECTABLE:              return "Uncorrectable ECC error";
+    case CUDA_ERROR_UNSUPPORTED_LIMIT:              return "Unsupported CUlimit";
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:         return "Context already in use";
+    case CUDA_ERROR_INVALID_SOURCE:                 return "Invalid source";
+    case CUDA_ERROR_FILE_NOT_FOUND:                 return "File not found";
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Shared object symbol not found";
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:      return "Shared object initialization failed";
+    case CUDA_ERROR_OPERATING_SYSTEM:               return "Operating System call failed";
+    case CUDA_ERROR_INVALID_HANDLE:                 return "Invalid handle";
+    case CUDA_ERROR_NOT_FOUND:                      return "Not found";
+    case CUDA_ERROR_NOT_READY:                      return "CUDA not ready";
+    case CUDA_ERROR_LAUNCH_FAILED:                  return "Launch failed";
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:        return "Launch exceeded resources";
+    case CUDA_ERROR_LAUNCH_TIMEOUT:                 return "Launch exceeded timeout";
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  return "Launch with incompatible texturing";
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:    return "Peer access already enabled";
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:        return "Peer access not enabled";
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:         return "Primary context active";
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED:           return "Context is destroyed";
+    case CUDA_ERROR_ASSERT:                         return "Device assert failed";
+    case CUDA_ERROR_TOO_MANY_PEERS:                 return "Too many peers";
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "Host memory already registered";
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:     return "Host memory not registered";
+    case CUDA_ERROR_UNKNOWN:                        return "Unknown error";
+    default:                                        return "Unknown error code";
+  }
+}
+
+/**
+ * Evaluate a CUDA driver API call once and, if it did not return
+ * CUDA_SUCCESS, print the call text, the numeric code, its description and
+ * the call site to stderr, then return the CUresult from the enclosing
+ * function.
+ *
+ * Only usable inside functions whose return type accepts a CUresult
+ * (selectCudaDevice() returns int).
+ */
+#define CU_ERROR_CHECK(call) \
+  do { \
+    const CUresult cuErrorCheckResult_ = (call); \
+    if (cuErrorCheckResult_ != CUDA_SUCCESS) { \
+      fprintf(stderr, "CUDA Driver error: %s returned %d (%s)\n\tin %s (%s:%d)\n", \
+              STRING(call), static_cast<int>(cuErrorCheckResult_), \
+              cuGetError(cuErrorCheckResult_), __func__, __FILE__, __LINE__); \
+      return cuErrorCheckResult_; \
+    } \
+  } while (false)
+
+
 #endif
 
-- 
GitLab