diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a657421626617b8d236bc4c7e46b6b69cedfdf0..514f120daf28a512659a12b6e86d5fe64f6c9c85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,6 +113,7 @@ endif()
 ###Add Libraries
 if (CUDA_FOUND)
 	cuda_add_cufft_to_target(bioEM)
+	target_link_libraries(bioEM ${CUDA_CUDA_LIBRARY})
 endif()
 
 target_link_libraries(bioEM -L${FFTW_LIBDIR} -lfftw3 -lfftw3f)
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index e412dfd52631f276017c36bb06b97269cfe8f1d6..18d4369d7abbd9f079fee04d554b39d38a162646 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -272,9 +272,109 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
 	return(0);
 }
 
+/**
+ * Selects the CUDA device with the highest estimated throughput and makes it
+ * the current device for subsequent CUDA runtime calls.
+ *
+ * Devices are ranked by multiProcessorCount * clockRate * warpSize. During the
+ * scan a temporary driver API context is created on each device so that its
+ * free memory can be reported in the debug output.
+ *
+ * @return 0 on success, 1 if no CUDA device was detected. A failing driver API
+ *         call terminates the process via exit(1).
+ */
+int bioem_cuda::selectCudaDevice()
+{
+	int count;
+
+	long long int bestDeviceSpeed = -1;
+	int bestDevice = 0;
+	cudaDeviceProp deviceProp;
+
+	checkCudaErrors(cudaGetDeviceCount(&count));
+	if (count == 0)
+	{
+		printf("No CUDA device detected\n");
+		return(1);
+	}
+
+	// The driver API needs to be initialized only once, not once per device.
+	if (cuInit(0) != CUDA_SUCCESS)
+	{
+		fprintf(stderr, "Error initializing the CUDA driver API\n");
+		exit(1);
+	}
+
+	for (int i = 0;i < count;i++)
+	{
+		printf("CUDA device %d\n", i);
+#if CUDA_VERSION > 3010
+		size_t free, total;
+#else
+		unsigned int free, total;
+#endif
+		// cuMemGetInfo reports on the current context, so bind a temporary one to device i.
+		CUdevice tmpDevice;
+		CUcontext tmpContext;
+		if (cuDeviceGet(&tmpDevice, i) != CUDA_SUCCESS || cuCtxCreate(&tmpContext, 0, tmpDevice) != CUDA_SUCCESS)
+		{
+			fprintf(stderr, "Error creating a temporary context on CUDA device %d\n", i);
+			exit(1);
+		}
+		if (cuMemGetInfo(&free, &total) != CUDA_SUCCESS)
+		{
+			fprintf(stderr, "Error querying the memory usage of CUDA device %d\n", i);
+			cuCtxDestroy(tmpContext);
+			exit(1);
+		}
+		cuCtxDestroy(tmpContext);
+		if (DebugOutput >= 1) printf("Obtained current memory usage for device %d\n", i);
+		checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
+		if (DebugOutput >= 1) printf("Obtained device properties for device %d\n", i);
+
+		if (DebugOutput >= 1) printf("%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)\n", i, deviceProp.name, deviceProp.major, deviceProp.minor, (long long int) free, (long long int) deviceProp.totalGlobalMem);
+		// Relative speed estimate, only used to rank the devices against each other.
+		long long int deviceSpeed = (long long int) deviceProp.multiProcessorCount * (long long int) deviceProp.clockRate * (long long int) deviceProp.warpSize;
+		if (deviceSpeed > bestDeviceSpeed)
+		{
+			bestDevice = i;
+			bestDeviceSpeed = deviceSpeed;
+		}
+	}
+
+	checkCudaErrors(cudaGetDeviceProperties(&deviceProp, bestDevice));
+
+	if (DebugOutput >= 1)
+	{
+		printf("Using CUDA Device %s with Properties:\n", deviceProp.name);
+		printf("totalGlobalMem = %llu\n", (unsigned long long int) deviceProp.totalGlobalMem);
+		printf("sharedMemPerBlock = %llu\n", (unsigned long long int) deviceProp.sharedMemPerBlock);
+		printf("regsPerBlock = %d\n", deviceProp.regsPerBlock);
+		printf("warpSize = %d\n", deviceProp.warpSize);
+		printf("memPitch = %llu\n", (unsigned long long int) deviceProp.memPitch);
+		printf("maxThreadsPerBlock = %d\n", deviceProp.maxThreadsPerBlock);
+		printf("maxThreadsDim = %d %d %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
+		printf("maxGridSize = %d %d %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
+		printf("totalConstMem = %llu\n", (unsigned long long int) deviceProp.totalConstMem);
+		printf("major = %d\n", deviceProp.major);
+		printf("minor = %d\n", deviceProp.minor);
+		printf("clockRate = %d\n", deviceProp.clockRate);
+		printf("memoryClockRate = %d\n", deviceProp.memoryClockRate);
+		printf("multiProcessorCount = %d\n", deviceProp.multiProcessorCount);
+		printf("textureAlignment = %llu\n", (unsigned long long int) deviceProp.textureAlignment);
+	}
+
+	// Make the selection effective; otherwise the runtime keeps using its default device.
+	checkCudaErrors(cudaSetDevice(bestDevice));
+
+	return(0);
+}
+
 int bioem_cuda::deviceInit()
 {
 	deviceExit();
+
+
 
 	if (FFTAlgo) GPUAlgo = 2;
 
@@ -471,5 +571,15 @@ void bioem_cuda::free_device_host(void* ptr)
 
 bioem* bioem_cuda_create()
 {
+	int count;
+
+	// Fall back to the CPU implementation when the runtime reports an error or no device.
+	if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
+	if (count == 0)
+	{
+		printf("No CUDA device available, using fallback to CPU version\n");
+		return new bioem;
+	}
+
 	return new bioem_cuda;
 }
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 67d1206fd8e4dc184e3183401f486f3f88f76d21..9aaa689d7aac7d445008cae1f2aaf2e6f5508ea1 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -26,6 +26,9 @@ protected:
 	virtual int deviceStartRun();
 	virtual int deviceFinishRun();
 	int deviceExit();
+
+private:
+	int selectCudaDevice();
 
 	int deviceInitialized;
 