/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
   < BioEM software for Bayesian inference of Electron Microscopy images>
   Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
   Max Planck Institute of Biophysics, Frankfurt, Germany.

   See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */

#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

// Evaluate the expression exactly once so the CUDA call is not re-executed
// while building the error message.
#define checkCudaErrors(error) \
    do \
    { \
        const cudaError_t cuda_check_result = (error); \
        if (cuda_check_result != cudaSuccess) \
        { \
            printf("CUDA Error %d / %s (%s: %d)\n", cuda_check_result, cudaGetErrorString(cuda_check_result), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

static const char *cufftGetErrorString(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";
        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";
        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";
        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";
        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";
        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";
        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";
        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";
        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";
        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }
    return "UNKNOWN";
}

bioem_cuda::bioem_cuda()
{
    deviceInitialized = 0;
    GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
    GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
    GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
    GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
}

bioem_cuda::~bioem_cuda()
{
    deviceExit();
}

__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int cent_x, const int cent_y, const int maxRef)
{
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
    if (iRefMap < maxRef)
    {
        compareRefMap<0>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap, cent_x, cent_y);
    }
}

__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int maxRef)
{
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
    if (iRefMap < maxRef)
    {
        compareRefMapShifted<1>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap);
    }
}

// Grid-stride loop that clears a buffer word by word; size must be a
// multiple of sizeof(int).
__global__ void cudaZeroMem(void* ptr, size_t size)
{
    int* myptr = (int*) ptr;
    int mysize = size / sizeof(int);
    int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
    int mygrid = myBlockDimX * myGridDimX;
    for (int i = myid; i < mysize; i += mygrid) myptr[i] = 0;
}
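// compareRefMapLoopShifts_kernel evaluates every (cent_x, cent_y) displacement
// of every reference map in one grid by bit-packing the indices into the
// global thread id: the lowest nShiftBits bits give the y-shift, the next
// nShiftBits bits the x-shift, and the high bits the reference map (which is
// why nShifts must be a power of two). Worked example for nShifts = 4,
// nShiftBits = 2, myid = 45 = 0b101101:
//   myShiftIdy = myid & 3        = 1
//   myShiftIdx = (myid >> 2) & 3 = 3
//   iRefMap    = myid >> 4       = 2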
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
    const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
    const int iRefMap = myid >> (nShiftBits << 1);
    const int myRef = myThreadIdxX >> (nShiftBits << 1);
    const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
    const int myShiftIdy = myid & (nShifts - 1);
    const int myShift = myid & (nShifts * nShifts - 1);
    const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
    const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
    // Out-of-range threads still take part in the block-wide reduction, hence
    // a threadActive flag instead of an early return.
    const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;

    compareRefMap<2>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}

// Multiplies the FFT of the convoluted map with the complex conjugate of each
// reference-map FFT (cross-correlation in Fourier space), one map per block.
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
{
    if (myBlockIdxX >= NumberMaps) return;
    const mycuComplex_t* myin = (mycuComplex_t*) &refmap[(myBlockIdxX + Offset) * MapSize];
    const mycuComplex_t* myconv = (mycuComplex_t*) convmap;
    mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
    for (int i = myThreadIdxX; i < NumberPixelsTotal; i += myBlockDimX)
    {
        mycuComplex_t val;
        const mycuComplex_t conv = myconv[i];
        const mycuComplex_t in = myin[i];
        val.x = conv.x * in.x + conv.y * in.y;
        val.y = conv.y * in.x - conv.x * in.y;
        myout[i] = val;
    }
}

__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset)
{
    if (myBlockIdxX * myBlockDimX + myThreadIdxX >= maxRef) return;
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
    const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
    doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, RefMap);
}

template <class T> static inline T divup(T num, T divider) {return ((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2(int value)
{
    DWORD index;
    _BitScanReverse(&index, value);
    return ((int) index);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif
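// Computes, for one orientation (iOrient) and one PSF convolution (iConv),
// the probability contributions of all reference maps against the convoluted
// model map. With FFTAlgo set, the cross-correlations over all shifts come
// from batched inverse FFTs, optionally pipelined over two CUDA streams
// (GPUDualStream) so the complex multiplication of one batch overlaps the
// transform of the other; otherwise one of three explicit shift kernels is
// chosen via GPUAlgo (2: all shifts in one launch with block-level reduction,
// 1: one launch per shift, 0: shift loop inside the kernel).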
int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
{
    if (startMap)
    {
        cout << "Error: startMap not implemented in the GPU code\n";
        exit(1);
    }
    if (GPUAsync)
    {
        checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
    }

    if (FFTAlgo)
    {
        memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
        checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
        if (GPUAsync)
        {
            checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
            checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
        }
        if (GPUDualStream)
        {
            checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
            checkCudaErrors(cudaStreamWaitEvent(cudaStream[1], cudaFFTEvent[0], 0));
        }
        for (int i = 0, j = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE, j++)
        {
            if (!GPUDualStream) j = 0;
            const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
            // One block per map; the threads of a block stride over the pixels.
            multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2[j & 1], param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
            // The remainder plan (plan[1][...]) is used for the last iteration
            // when maxRef is not a multiple of CUDA_FFTS_AT_ONCE.
            cufftResult err = mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1][j & 1] : plan[0][j & 1], pFFTtmp2[j & 1], pFFTtmp[j & 1]);
            if (err != CUFFT_SUCCESS)
            {
                cout << "Error running CUFFT " << cufftGetErrorString(err) << "\n";
                exit(1);
            }
            cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(iOrient, iConv, pFFTtmp[j & 1], sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
        }
        checkCudaErrors(cudaGetLastError());
        if (GPUDualStream)
        {
            checkCudaErrors(cudaEventRecord(cudaFFTEvent[1], cudaStream[1]));
            checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaFFTEvent[1], 0));
        }
    }
    else
    {
        checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));

        if (GPUAlgo == 2) // Loop over shifts
        {
            const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
            if (!IsPowerOf2(nShifts))
            {
                cout << "Invalid number of displacements, not a power of two\n";
                exit(1);
            }
            if (CUDA_THREAD_COUNT % (nShifts * nShifts))
            {
                cout << "CUDA thread count (" << CUDA_THREAD_COUNT << ") is not a multiple of the number of shifts (" << (nShifts * nShifts) << ")\n";
                exit(1);
            }
            if (nShifts > CUDA_MAX_SHIFT_REDUCE)
            {
                cout << "Too many displacements for CUDA reduction\n";
                exit(1);
            }
            const int nShiftBits = ilog2(nShifts);
            size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
            size_t nBlocks = CUDA_BLOCK_COUNT;
            for (size_t i = 0; i < totalBlocks; i += nBlocks) // Loop in case the grid exceeds the per-launch block count
            {
                // Dynamic shared memory holds the per-block reduction buffers.
                compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream[0]>>>(iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
            }
        }
        else if (GPUAlgo == 1) // Split shifts into multiple kernels
        {
            for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x += param.param_device.GridSpaceCenter)
            {
                for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y += param.param_device.GridSpaceCenter)
                {
                    compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>>(iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, cent_x, cent_y, maxRef);
                }
            }
        }
        else if (GPUAlgo == 0) // All shifts in one kernel
        {
            compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>>(iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, maxRef);
        }
        else
        {
            cout << "Invalid GPU algorithm selected\n";
            exit(1);
        }
    }
    if (GPUWorkload < 100)
    {
        // The CPU handles the remaining maps [maxRef, ntotRefMap).
        bioem::compareRefMaps(iOrient, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
    }
    if (GPUAsync)
    {
        checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
    }
    else
    {
        checkCudaErrors(cudaStreamSynchronize(cudaStream[0]));
    }
    return (0);
}
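// Ranks all visible devices by a crude speed estimate
// (multiProcessorCount * clockRate * warpSize) and selects the fastest. The
// GPUDEVICE environment variable overrides the choice; under MPI,
// GPUDEVICE=-1 assigns ranks round-robin, rank -> device mpi_rank % count
// (so on a hypothetical node with 4 GPUs, ranks 0-3 use GPUs 0-3 and
// rank 4 wraps back to GPU 0).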
int bioem_cuda::selectCudaDevice()
{
    int count;
    long long int bestDeviceSpeed = -1;
    int bestDevice = 0;
    cudaDeviceProp deviceProp;

    checkCudaErrors(cudaGetDeviceCount(&count));
    if (count == 0)
    {
        printf("No CUDA device detected\n");
        return (1);
    }
    for (int i = 0; i < count; i++)
    {
#if CUDA_VERSION > 3010
        size_t free, total;
#else
        unsigned int free, total;
#endif
        cuInit(0);
        CUdevice tmpDevice;
        cuDeviceGet(&tmpDevice, i);
        CUcontext tmpContext;
        cuCtxCreate(&tmpContext, 0, tmpDevice);
        if (cuMemGetInfo(&free, &total)) exit(1);
        cuCtxDestroy(tmpContext);
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
        if (DebugOutput >= 2 && mpi_rank == 0) printf("CUDA Device %2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)\n", i, deviceProp.name, deviceProp.major, deviceProp.minor, (long long int) free, (long long int) deviceProp.totalGlobalMem);
        long long int deviceSpeed = (long long int) deviceProp.multiProcessorCount * (long long int) deviceProp.clockRate * (long long int) deviceProp.warpSize;
        if (deviceSpeed > bestDeviceSpeed)
        {
            bestDevice = i;
            bestDeviceSpeed = deviceSpeed;
        }
    }
    if (getenv("GPUDEVICE"))
    {
        int device = atoi(getenv("GPUDEVICE"));
        if (device >= count)
        {
            printf("Invalid CUDA device specified, max device number is %d\n", count - 1);
            exit(1);
        }
#ifdef WITH_MPI
        if (device == -1)
        {
            device = mpi_rank % count;
        }
#endif
        if (device < 0)
        {
            printf("Negative CUDA device specified: %d, invalid!\n", device);
            exit(1);
        }
        bestDevice = device;
    }
    checkCudaErrors(cudaSetDevice(bestDevice));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, bestDevice));
    if (DebugOutput >= 3)
    {
        printf("Using CUDA Device %s with Properties:\n", deviceProp.name);
        printf("totalGlobalMem = %lld\n", (unsigned long long int) deviceProp.totalGlobalMem);
        printf("sharedMemPerBlock = %lld\n", (unsigned long long int) deviceProp.sharedMemPerBlock);
        printf("regsPerBlock = %d\n", deviceProp.regsPerBlock);
        printf("warpSize = %d\n", deviceProp.warpSize);
        printf("memPitch = %lld\n", (unsigned long long int) deviceProp.memPitch);
        printf("maxThreadsPerBlock = %d\n", deviceProp.maxThreadsPerBlock);
        printf("maxThreadsDim = %d %d %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("maxGridSize = %d %d %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("totalConstMem = %lld\n", (unsigned long long int) deviceProp.totalConstMem);
        printf("major = %d\n", deviceProp.major);
        printf("minor = %d\n", deviceProp.minor);
        printf("clockRate = %d\n", deviceProp.clockRate);
        printf("memoryClockRate = %d\n", deviceProp.memoryClockRate);
        printf("multiProcessorCount = %d\n", deviceProp.multiProcessorCount);
        printf("textureAlignment = %lld\n", (unsigned long long int) deviceProp.textureAlignment);
    }
    if (DebugOutput >= 1)
    {
        printf("BioEM for CUDA initialized (MPI Rank %d), %d GPUs found, using GPU %d\n", mpi_rank, count, bestDevice);
    }
    return (0);
}

int bioem_cuda::deviceInit()
{
    deviceExit();
    selectCudaDevice();

    if (FFTAlgo) GPUAlgo = 2;

    gpumap = new bioem_RefMap;
    memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
    if (FFTAlgo == 0)
    {
        checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize));
        if (GPUAlgo == 0 || GPUAlgo == 1)
        {
            pRefMap_device_Mod = (bioem_RefMap_Mod*) gpumap;
            bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod;
            RefMapGPU->init(RefMap);
            checkCudaErrors(cudaMemcpy(maps, RefMapGPU->maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
            delete RefMapGPU;
        }
        else
        {
            checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
        }
    }
    checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
    checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
    checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
    gpumap->maps = maps;
    gpumap->sum_RefMap = sum;
    gpumap->sumsquare_RefMap = sumsquare;

    // Allocate pProb_memory once, sized for the larger of the probability and
    // cross-correlation layouts, since both views share this buffer.
    size_t probSize = pProb_device.get_size(RefMap.ntotRefMap, param.nTotGridAngles, param.param_device.writeAngles);
    const size_t probSizeCC = pProb_device.get_sizeCC(RefMap.ntotRefMap, param.nTotCC, param.param_device.writeCC);
    if (probSizeCC > probSize) probSize = probSizeCC;
    checkCudaErrors(cudaMalloc(&pProb_memory, probSize));
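    // Streams and events double-buffer the convoluted maps: while the kernels
    // for convolution iConv run, the upload for iConv + 1 can already start,
    // synchronized through cudaEvent[iConv & 1] in compareRefMaps(). With
    // GPUAsync, a third stream/event pair is created below that is dedicated
    // to host-to-device copies.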
    for (int i = 0; i < 2; i++)
    {
        checkCudaErrors(cudaStreamCreate(&cudaStream[i]));
        checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
        checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
        checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
    }
    if (GPUAsync)
    {
        checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
        checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
    }

    if (FFTAlgo)
    {
        checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
        checkCudaErrors(cudaMalloc(&pFFTtmp2[0], CUDA_FFTS_AT_ONCE * param.FFTMapSize * 2 * sizeof(mycomplex_t)));
        checkCudaErrors(cudaMalloc(&pFFTtmp[0], CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * 2 * sizeof(myfloat_t)));
        // The second halves of the scratch buffers back the second stream.
        pFFTtmp2[1] = pFFTtmp2[0] + CUDA_FFTS_AT_ONCE * param.FFTMapSize;
        pFFTtmp[1] = pFFTtmp[0] + CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels;
        checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
        checkCudaErrors(cudaHostAlloc(&pConvMapFFT_Host, param.FFTMapSize * sizeof(mycomplex_t) * 2, 0));
        checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
    }

    deviceInitialized = 1;
    return (0);
}

int bioem_cuda::deviceExit()
{
    if (deviceInitialized == 0) return (0);

    cudaFree(pProb_memory);
    cudaFree(sum);
    cudaFree(sumsquare);
    for (int i = 0; i < 2; i++)
    {
        cudaStreamDestroy(cudaStream[i]);
        cudaEventDestroy(cudaEvent[i]);
        cudaEventDestroy(cudaFFTEvent[i]);
        cudaFree(pConvMap_device[i]);
    }
    if (FFTAlgo)
    {
        cudaFree(pRefMapsFFT);
        cudaFree(pConvMapFFT);
        cudaFreeHost(pConvMapFFT_Host);
        cudaFree(pFFTtmp[0]);
        cudaFree(pFFTtmp2[0]);
    }
    else
    {
        cudaFree(maps);
    }
    // pRefMap_device_Mod merely aliases the host object gpumap (see
    // deviceInit), so it is released by the delete below, not by cudaFree.
    if (GPUAsync)
    {
        cudaStreamDestroy(cudaStream[2]);
        cudaEventDestroy(cudaEvent[2]);
    }
    delete gpumap;
    cudaThreadExit();

    deviceInitialized = 0;
    return (0);
}

int bioem_cuda::deviceStartRun()
{
    if (GPUWorkload >= 100)
    {
        maxRef = RefMap.ntotRefMap;
        pProb_host = &pProb;
    }
    else
    {
        maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
        pProb_host = new bioem_Probability;
        pProb_host->init(maxRef, param.nTotGridAngles, *this);
        pProb_host->initCC(maxRef, param.nTotCC, *this);
        pProb_host->copyFrom(&pProb, *this);
    }

    pProb_device = *pProb_host;
    pProb_device.ptr = pProb_memory;
    pProb_device.set_pointers();
    checkCudaErrors(cudaMemcpyAsync(pProb_device.ptr, pProb_host->ptr, pProb_host->get_size(maxRef, param.nTotGridAngles, param.param_device.writeAngles), cudaMemcpyHostToDevice, cudaStream[0]));
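    // plan[i][j] is a 2 x 2 matrix of batched C2R plans: i selects the batch
    // size (0: a full batch of CUDA_FFTS_AT_ONCE maps, 1: the remainder batch
    // of maxRef % CUDA_FFTS_AT_ONCE maps, skipped when maxRef divides evenly)
    // and j the CUDA stream the plan is bound to. A minimal single-plan sketch
    // with the same layout arguments, assuming a hypothetical batch size num:
    //   int n[2] = {NumberPixels, NumberPixels};
    //   cufftHandle p;
    //   cufftPlanMany(&p, 2, n, NULL, 1, FFTMapSize, NULL, 1, 0, MY_CUFFT_C2R, num);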
    if (FFTAlgo)
    {
        for (int j = 0; j < 2; j++)
        {
            for (int i = 0; i < 2; i++)
            {
                if (i && maxRef % CUDA_FFTS_AT_ONCE == 0) continue;
                int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
                if (cufftPlanMany(&plan[i][j], 2, n, NULL, 1, param.FFTMapSize, NULL, 1, 0, MY_CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
                {
                    cout << "Error planning CUFFT\n";
                    exit(1);
                }
                if (cufftSetCompatibilityMode(plan[i][j], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
                {
                    cout << "Error setting CUFFT compatibility mode\n";
                    exit(1);
                }
                if (cufftSetStream(plan[i][j], cudaStream[j]) != CUFFT_SUCCESS)
                {
                    cout << "Error setting CUFFT stream\n";
                    exit(1);
                }
            }
            if (!GPUDualStream) break;
        }
    }
    return (0);
}

int bioem_cuda::deviceFinishRun()
{
    if (GPUAsync) cudaStreamSynchronize(cudaStream[0]);
    checkCudaErrors(cudaMemcpyAsync(pProb_host->ptr, pProb_device.ptr, pProb_host->get_size(maxRef, param.nTotGridAngles, param.param_device.writeAngles), cudaMemcpyDeviceToHost, cudaStream[0]));

    if (FFTAlgo)
    {
        for (int j = 0; j < 2; j++)
        {
            for (int i = 0; i < 2; i++)
            {
                if (i && maxRef % CUDA_FFTS_AT_ONCE == 0) continue;
                cufftDestroy(plan[i][j]);
            }
            if (!GPUDualStream) break;
        }
    }
    cudaThreadSynchronize();
    if (GPUWorkload < 100)
    {
        pProb.copyFrom(pProb_host, *this);
        free_device_host(pProb_host->ptr);
        delete pProb_host;
    }

    return (0);
}

void* bioem_cuda::malloc_device_host(size_t size)
{
    void* ptr;
    checkCudaErrors(cudaHostAlloc(&ptr, size, 0));
    return (ptr);
}

void bioem_cuda::free_device_host(void* ptr)
{
    cudaFreeHost(ptr);
}

bioem* bioem_cuda_create()
{
    int count;
    if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
    if (count == 0)
    {
        printf("No CUDA device available, using fallback to CPU version\n");
        return new bioem;
    }
    return new bioem_cuda;
}
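// Usage sketch (illustrative only; a hypothetical caller): the factory falls
// back to the plain CPU bioem object when no CUDA device is present, so the
// caller never has to distinguish the two implementations:
//   bioem* bio = bioem_cuda_create();
//   ... run the usual BioEM pipeline ...
//   delete bio;
// Runtime tuning is read from the environment in the constructor and in
// selectCudaDevice():
//   GPUALGO       (0/1/2, default 2)    shift-kernel variant (non-FFT path)
//   GPUASYNC      (0/1, default 1)      overlap copies with compute
//   GPUWORKLOAD   (0-100, default 100)  percentage of maps done on the GPU
//   GPUDUALSTREAM (0/1, default 1)      two streams for the FFT pipeline
//   GPUDEVICE     (id, or -1 under MPI for rank % device-count)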