From c4d1dff7816f11c82b588b683a893dce27ff2f5d Mon Sep 17 00:00:00 2001
From: Luka Stanisic <luka.stanisic@mpcdf.mpg.de>
Date: Fri, 16 Jun 2017 10:33:56 +0200
Subject: [PATCH] checking CUDA driver errors

---
 bioem_cuda.cu                 |  8 ++--
 include/bioem_cuda_internal.h | 71 +++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index aa70fb7..26535b3 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -297,13 +297,13 @@ int bioem_cuda::selectCudaDevice()
 #else
 		unsigned int free, total;
 #endif
-		cuInit(0);
+		CU_ERROR_CHECK(cuInit(0));
 		CUdevice tmpDevice;
-		cuDeviceGet(&tmpDevice, i);
+		CU_ERROR_CHECK(cuDeviceGet(&tmpDevice, i));
 		CUcontext tmpContext;
-		cuCtxCreate(&tmpContext, 0, tmpDevice);
+		CU_ERROR_CHECK(cuCtxCreate(&tmpContext, 0, tmpDevice));
 		if(cuMemGetInfo(&free, &total)) exit(1);
-		cuCtxDestroy(tmpContext);
+		CU_ERROR_CHECK(cuCtxDestroy(tmpContext));
 		checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
 
 		if (DebugOutput >= 2 && mpi_rank == 0) printf("CUDA Device %2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)\n", i, deviceProp.name, deviceProp.major, deviceProp.minor, (long long int) free, (long long int) deviceProp.totalGlobalMem);
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 4d44c5e..60939da 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -72,5 +72,76 @@ private:
 	int maxRef;
 };
 
+/* Handling CUDA Driver errors */
+/* Inspired by: https://github.com/garymacindoe/cuda-cholesky */
+
+// Two-level stringification: STRING(x) macro-expands x first, then STRINGx(x) turns the result into a string literal
+#define STRINGx(x) #x
+#define STRING(x) STRINGx(x)
+// Returns a static description of a CUDA driver status code; `inline` so this header can be included in several translation units.
+inline const char * cuGetError(CUresult result) {
+  switch (result) {
+    case CUDA_SUCCESS:                              return "No errors";
+    case CUDA_ERROR_INVALID_VALUE:                  return "Invalid value";
+    case CUDA_ERROR_OUT_OF_MEMORY:                  return "Out of memory";
+    case CUDA_ERROR_NOT_INITIALIZED:                return "Driver not initialized";
+    case CUDA_ERROR_DEINITIALIZED:                  return "Driver deinitialized";
+    case CUDA_ERROR_PROFILER_DISABLED:              return "Profiler disabled";
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:       return "Profiler not initialized";
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED:       return "Profiler already started";
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:       return "Profiler already stopped";
+    case CUDA_ERROR_NO_DEVICE:                      return "No CUDA-capable device available";
+    case CUDA_ERROR_INVALID_DEVICE:                 return "Invalid device";
+    case CUDA_ERROR_INVALID_IMAGE:                  return "Invalid kernel image";
+    case CUDA_ERROR_INVALID_CONTEXT:                return "Invalid context";
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:        return "Context already current";
+    case CUDA_ERROR_MAP_FAILED:                     return "Map failed";
+    case CUDA_ERROR_UNMAP_FAILED:                   return "Unmap failed";
+    case CUDA_ERROR_ARRAY_IS_MAPPED:                return "Array is mapped";
+    case CUDA_ERROR_ALREADY_MAPPED:                 return "Already mapped";
+    case CUDA_ERROR_NO_BINARY_FOR_GPU:              return "No binary for GPU";
+    case CUDA_ERROR_ALREADY_ACQUIRED:               return "Already acquired";
+    case CUDA_ERROR_NOT_MAPPED:                     return "Not mapped";
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:            return "Not mapped as array";
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:          return "Not mapped as pointer";
+    case CUDA_ERROR_ECC_UNCORRECTABLE:              return "Uncorrectable ECC error";
+    case CUDA_ERROR_UNSUPPORTED_LIMIT:              return "Unsupported CUlimit";
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:         return "Context already in use";
+    case CUDA_ERROR_INVALID_SOURCE:                 return "Invalid source";
+    case CUDA_ERROR_FILE_NOT_FOUND:                 return "File not found";
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Shared object symbol not found";
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:      return "Shared object initialization failed";
+    case CUDA_ERROR_OPERATING_SYSTEM:               return "Operating System call failed";
+    case CUDA_ERROR_INVALID_HANDLE:                 return "Invalid handle";
+    case CUDA_ERROR_NOT_FOUND:                      return "Not found";
+    case CUDA_ERROR_NOT_READY:                      return "CUDA not ready";
+    case CUDA_ERROR_LAUNCH_FAILED:                  return "Launch failed";
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:        return "Launch exceeded resources";
+    case CUDA_ERROR_LAUNCH_TIMEOUT:                 return "Launch exceeded timeout";
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  return "Launch with incompatible texturing";
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:    return "Peer access already enabled";
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:        return "Peer access not enabled";
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:         return "Primary context active";
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED:           return "Context is destroyed";
+    case CUDA_ERROR_ASSERT:                         return "Device assert failed";
+    case CUDA_ERROR_TOO_MANY_PEERS:                 return "Too many peers";
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "Host memory already registered";
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:     return "Host memory not registered";
+    case CUDA_ERROR_UNKNOWN:                        return "Unknown error";
+    default:                                        return "Unknown error code"; // any code not listed above
+  }
+}
+
+// Runs a CUDA driver API call once; on failure logs code, description and call site to stderr and RETURNS the CUresult from the enclosing function.
+#define CU_ERROR_CHECK(call) \
+  do { \
+    const CUresult cuErrorCheckStatus_ = (call); \
+    if (cuErrorCheckStatus_ != CUDA_SUCCESS) { \
+      fprintf(stderr, "CUDA driver error %d (%s): %s failed in %s (%s:%d)\n", (int) cuErrorCheckStatus_, \
+              cuGetError(cuErrorCheckStatus_), STRING(call), __func__, __FILE__, __LINE__); \
+      return cuErrorCheckStatus_; \
+    } \
+  } while (false)
+
 #endif
 
-- 
GitLab