#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

#define checkCudaErrors(error) \
    { \
        if ((error) != cudaSuccess) \
        { \
            printf("CUDA Error %d / %s (%s: %d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__); \
            exit(1); \
        } \
    }

bioem_cuda::bioem_cuda()
{
    deviceInitialized = 0;
    GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
    GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
    GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
}

bioem_cuda::~bioem_cuda()
{
    deviceExit();
}

__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
{
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
    if (iRefMap < maxRef)
    {
        compareRefMap<0>(iRefMap, iOrient, iConv, pMap, pProb, param, *RefMap, cent_x, cent_y);
    }
}

__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
{
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
    if (iRefMap < maxRef)
    {
        compareRefMapShifted<1>(iRefMap, iOrient, iConv, pMap, pProb, param, *RefMap);
    }
}

//Grid-stride loop that zeroes a device buffer, interpreted as ints.
__global__ void cudaZeroMem(void* ptr, size_t size)
{
    int* myptr = (int*) ptr;
    int mysize = size / sizeof(int);
    int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
    int mygrid = myBlockDimX * myGridDimX;
    for (int i = myid; i < mysize; i += mygrid) myptr[i] = 0;
}

__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
    const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
    const int iRefMap = myid >> (nShiftBits << 1);
    const int myRef = myThreadIdxX >> (nShiftBits << 1);
    const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
    const int myShiftIdy = myid & (nShifts - 1);
    const int myShift = myid & (nShifts * nShifts - 1);
    const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
    const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
    const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
    compareRefMap<2>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}

__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
{
    if (myBlockIdxX >= NumberMaps) return;
    const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize + Offset];
    mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
    for (int i = myThreadIdxX; i < NumberPixelsTotal; i += myBlockDimX)
    {
        myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
        myout[i].y = convmap[i][1] * myin[i][0] - convmap[i][0] * myin[i][1];
    }
}
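// A note on the arithmetic in multComplexMap above: writing conv = c0 + i*c1
// and ref = r0 + i*r1, each output element is conv * conj(ref) =
// (c0*r0 + c1*r1) + i*(c1*r0 - c0*r1). By the convolution theorem, the
// inverse FFT of this pointwise product (executed with CUFFT in
// compareRefMaps below) yields the cross-correlation of the convolved
// projection with a reference map for all in-plane displacements at once.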
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset)
{
    const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
    const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
    if (iRefMap >= maxRef) return;
    doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, RefMap);
}

template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2(int value)
{
    DWORD index;
    _BitScanReverse(&index, value);
    return((int) index);
}
#else
static inline int ilog2(int value) {return(31 - __builtin_clz(value));}
#endif
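// compareRefMaps processes one (orientation, convolution) pair against all
// reference maps assigned to the GPU. Three kernel strategies are selected
// via the GPUALGO environment variable: 2 = loop over all displacements
// inside a single kernel with an in-block reduction, 1 = one kernel launch
// per displacement, 0 = all displacements handled within one kernel per map.
// Convolution buffers and events are double-buffered via (iConv & 1) so that
// host-to-device copies can overlap with compute when GPUASYNC is set, and
// GPUWORKLOAD < 100 leaves the tail of the reference maps to the CPU code.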
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
{
    if (startMap)
    {
        cout << "Error: startMap not implemented for GPU code\n";
        exit(1);
    }
    if (GPUAsync)
    {
        checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
    }
    if (FFTAlgo)
    {
        checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
        for (int i = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE)
        {
            const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
            //Launch configurations below were lost in the source; reconstructed (assumption): one block per map for the multiplication, one thread per map for the FFT post-processing, both on cudaStream.
            multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
            if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
            {
                cout << "Error running CUFFT\n";
                exit(1);
            }
            cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
        }
        checkCudaErrors(cudaGetLastError());
    }
    else
    {
        checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream));
        if (GPUAlgo == 2) //Loop over shifts
        {
            const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
            if (!IsPowerOf2(nShifts))
            {
                cout << "Invalid number of displacements, not a power of two\n";
                exit(1);
            }
            if (CUDA_THREAD_COUNT % (nShifts * nShifts))
            {
                cout << "CUDA thread count (" << CUDA_THREAD_COUNT << ") is not a multiple of the number of shifts (" << (nShifts * nShifts) << ")\n";
                exit(1);
            }
            if (nShifts > CUDA_MAX_SHIFT_REDUCE)
            {
                cout << "Too many displacements for CUDA reduction\n";
                exit(1);
            }
            const int nShiftBits = ilog2(nShifts);
            size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
            size_t nBlocks = CUDA_BLOCK_COUNT;
            for (size_t i = 0; i < totalBlocks; i += nBlocks)
            {
                //Launch configuration reconstructed (assumption), including a dynamic shared-memory size for the per-block reduction.
                compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>>(iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
            }
        }
        else if (GPUAlgo == 1) //Split shifts into multiple kernels
        {
            for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x += param.param_device.GridSpaceCenter)
            {
                for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y += param.param_device.GridSpaceCenter)
                {
                    //Launch configuration reconstructed (assumption): one thread per reference map.
                    compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
                }
            }
        }
        else if (GPUAlgo == 0) //All shifts in one kernel
        {
            //Launch configuration reconstructed (assumption): one thread per reference map.
            compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
        }
        else
        {
            cout << "Invalid GPU algorithm selected\n";
            exit(1);
        }
    }
    if (GPUWorkload < 100)
    {
        //Process the remaining reference maps [maxRef, ntotRefMap) on the CPU while the GPU works.
        bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
    }
    if (GPUAsync)
    {
        checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
    }
    else
    {
        checkCudaErrors(cudaStreamSynchronize(cudaStream));
    }
    return(0);
}

int bioem_cuda::deviceInit()
{
    deviceExit();
    if (FFTAlgo) GPUAlgo = 2;
    gpumap = new bioem_RefMap;
    memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
    if (FFTAlgo == 0)
    {
        checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize));
        checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
    }
    checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
    checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
    checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
    gpumap->maps = maps;
    gpumap->sum_RefMap = sum;
    gpumap->sumsquare_RefMap = sumsquare;
    checkCudaErrors(cudaStreamCreate(&cudaStream));
    checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
    for (int i = 0; i < 2; i++)
    {
        checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
        checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
    }
    if (FFTAlgo)
    {
        checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
        checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.FFTMapSize * sizeof(mycomplex_t)));
        checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
        checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
        checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
    }
    if (GPUAlgo == 0 || GPUAlgo == 1)
    {
        checkCudaErrors(cudaMalloc(&pRefMap_device_Mod, sizeof(bioem_RefMap_Mod)));
        bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
        checkCudaErrors(cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap_Mod), cudaMemcpyHostToDevice));
        delete RefMapGPU;
    }
    deviceInitialized = 1;
    return(0);
}

int bioem_cuda::deviceExit()
{
    if (deviceInitialized == 0) return(0);
    cudaStreamDestroy(cudaStream);
    cudaFree(pProb_device);
    cudaFree(sum);
    cudaFree(sumsquare);
    for (int i = 0; i < 2; i++)
    {
        cudaEventDestroy(cudaEvent[i]);
        cudaFree(pConvMap_device[i]);
    }
    if (FFTAlgo)
    {
        cudaFree(pRefMapsFFT);
        cudaFree(pConvMapFFT);
        cudaFree(pFFTtmp);
        cudaFree(pFFTtmp2);
    }
    else
    {
        cudaFree(maps);
    }
    if (GPUAlgo == 0 || GPUAlgo == 1)
    {
        cudaFree(pRefMap_device_Mod);
    }
    delete gpumap;
    cudaThreadExit();
    deviceInitialized = 0;
    return(0);
}
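// deviceStartRun derives the GPU share of the reference maps from
// GPUWORKLOAD and uploads the current probability array. In FFT mode it also
// creates two batched CUFFT plans per run: plan[0] for full batches of
// CUDA_FFTS_AT_ONCE maps and plan[1] for the final partial batch (note that
// a remainder of zero would make plan[1] an empty plan).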
int bioem_cuda::deviceStartRun()
{
    maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
    cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
    if (FFTAlgo)
    {
        for (int i = 0; i < 2; i++)
        {
            int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
            if (cufftPlanMany(&plan[i], 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
            {
                cout << "Error planning CUFFT\n";
                exit(1);
            }
            if (cufftSetCompatibilityMode(plan[i], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
            {
                cout << "Error setting CUFFT compatibility mode\n";
                exit(1);
            }
            if (cufftSetStream(plan[i], cudaStream) != CUFFT_SUCCESS)
            {
                cout << "Error setting CUFFT stream\n";
                exit(1);
            }
        }
    }
    return(0);
}

int bioem_cuda::deviceFinishRun()
{
    if (GPUAsync) cudaStreamSynchronize(cudaStream);
    cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
    if (FFTAlgo)
    {
        for (int i = 0; i < 2; i++) cufftDestroy(plan[i]);
    }
    return(0);
}

bioem* bioem_cuda_create()
{
    return new bioem_cuda;
}
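// Sketch of the expected host-side call sequence (illustrative only; the
// driver code is not part of this file and the loop structure is assumed):
//
//   bioem* bio = bioem_cuda_create();
//   bio->deviceInit();        // allocate device buffers, upload reference maps
//   bio->deviceStartRun();    // create CUFFT plans, upload probabilities
//   // for each orientation iOrient and convolution iConv:
//   //     bio->compareRefMaps(iOrient, iConv, conv_map, localmultFFT, sumC, sumsquareC);
//   bio->deviceFinishRun();   // synchronize and download probabilities
//   bio->deviceExit();        // release device resources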