Commit 4f312113 authored by David Rohr's avatar David Rohr
Browse files

Implement combined GPU/CPU processing via GPUWORKLOAD environment variable

parent c9754f9e
...@@ -347,10 +347,10 @@ int bioem::run() ...@@ -347,10 +347,10 @@ int bioem::run()
return(0); return(0);
} }
int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map) int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
{ {
#pragma omp parallel for #pragma omp parallel for
for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap ++) for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
{ {
compareRefMapShifted<-1>(iRefMap,iProjectionOut,iConv,conv_map, pProb, param.param_device, RefMap); compareRefMapShifted<-1>(iRefMap,iProjectionOut,iConv,conv_map, pProb, param.param_device, RefMap);
} }
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
template <int GPUAlgo, class RefT> template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0) const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
{ {
/**************************************************************************************/ /**************************************************************************************/
/********************** Calculating BioEM Probability ********************************/ /********************** Calculating BioEM Probability ********************************/
...@@ -31,7 +31,8 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -31,7 +31,8 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
myfloat_t crossproMapConv=0.0; myfloat_t crossproMapConv=0.0;
#endif #endif
/****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/ /****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/
if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap) myfloat_t logpro;
if (GPUAlgo != 2 || threadActive)
{ {
int iStart, jStart, iEnd, jEnd; int iStart, jStart, iEnd, jEnd;
if (cent_x < 0) if (cent_x < 0)
...@@ -96,19 +97,23 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -96,19 +97,23 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
sumsquare = _mm_cvtss_f32(sumsquare_v); sumsquare = _mm_cvtss_f32(sumsquare_v);
crossproMapConv = _mm_cvtss_f32(cross_v); crossproMapConv = _mm_cvtss_f32(cross_v);
#endif #endif
}
/********** Calculating elements in BioEM Probability formula ********/ /********** Calculating elements in BioEM Probability formula ********/
// Related to Reference calculated Projection // Related to Reference calculated Projection
const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum); const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
// Products of different cross-correlations (first element in formula) // Products of different cross-correlations (first element in formula)
const myfloat_t firstele = param.Ntotpi * (RefMap.sumsquare_RefMap[iRefMap] * sumsquare-crossproMapConv * crossproMapConv) + const myfloat_t firstele = param.Ntotpi * (RefMap.sumsquare_RefMap[iRefMap] * sumsquare-crossproMapConv * crossproMapConv) +
2 * RefMap.sum_RefMap[iRefMap] * sum * crossproMapConv - RefMap.sumsquare_RefMap[iRefMap] * sum * sum - RefMap.sum_RefMap[iRefMap] * RefMap.sum_RefMap[iRefMap] * sumsquare; 2 * RefMap.sum_RefMap[iRefMap] * sum * crossproMapConv - RefMap.sumsquare_RefMap[iRefMap] * sum * sum - RefMap.sum_RefMap[iRefMap] * RefMap.sum_RefMap[iRefMap] * sumsquare;
//******* Calculating log of Prob*********/ //******* Calculating log of Prob*********/
// As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1); // As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
const myfloat_t logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb); logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
}
else
{
logpro = 0;
}
#ifdef BIOEM_GPUCODE #ifdef BIOEM_GPUCODE
if (GPUAlgo == 2) if (GPUAlgo == 2)
......
...@@ -25,7 +25,8 @@ bioem_cuda::bioem_cuda() ...@@ -25,7 +25,8 @@ bioem_cuda::bioem_cuda()
{ {
deviceInitialized = 0; deviceInitialized = 0;
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO")); GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 0 : atoi(getenv("GPUASYNC")); GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
} }
bioem_cuda::~bioem_cuda() bioem_cuda::~bioem_cuda()
...@@ -33,25 +34,25 @@ bioem_cuda::~bioem_cuda() ...@@ -33,25 +34,25 @@ bioem_cuda::~bioem_cuda()
deviceExit(); deviceExit();
} }
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y) __global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
{ {
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < RefMap->ntotRefMap) if (iRefMap < maxRef)
{ {
compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y); compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
} }
} }
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap) __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
{ {
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < RefMap->ntotRefMap) if (iRefMap < maxRef)
{ {
compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap); compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
} }
} }
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits) __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{ {
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX; const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
const int iRefMap = myid >> (nShiftBits << 1); const int iRefMap = myid >> (nShiftBits << 1);
...@@ -62,7 +63,9 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon ...@@ -62,7 +63,9 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter; const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter; const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef); const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
} }
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);} template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
...@@ -78,8 +81,13 @@ static inline int ilog2 (int value) ...@@ -78,8 +81,13 @@ static inline int ilog2 (int value)
static inline int ilog2(int value) {return 31 - __builtin_clz(value);} static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif #endif
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map) int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
{ {
if (startMap)
{
cout << "Error startMap not implemented for GPU Code\n";
exit(1);
}
//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream); //cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
if (GPUAsync) if (GPUAsync)
{ {
...@@ -106,11 +114,11 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -106,11 +114,11 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
exit(1); exit(1);
} }
const int nShiftBits = ilog2(nShifts); const int nShiftBits = ilog2(nShifts);
size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT); size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT; size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks) for (size_t i = 0;i < totalBlocks;i += nBlocks)
{ {
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits); compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
} }
} }
else if (GPUAlgo == 1) //Split shifts in multiple kernels else if (GPUAlgo == 1) //Split shifts in multiple kernels
...@@ -119,23 +127,27 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -119,23 +127,27 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
{ {
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter) for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{ {
compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y); compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
} }
} }
} }
else if (GPUAlgo == 0) //All shifts in one kernel else if (GPUAlgo == 0) //All shifts in one kernel
{ {
compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod); compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
} }
else else
{ {
cout << "Invalid GPU Algorithm selected\n"; cout << "Invalid GPU Algorithm selected\n";
exit(1); exit(1);
} }
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iProjectionOut, iConv, conv_map, maxRef);
}
if (GPUAsync) if (GPUAsync)
{ {
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream)); checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
} }
else else
{ {
checkCudaErrors(cudaStreamSynchronize(cudaStream)); checkCudaErrors(cudaStreamSynchronize(cudaStream));
...@@ -191,15 +203,16 @@ int bioem_cuda::deviceExit() ...@@ -191,15 +203,16 @@ int bioem_cuda::deviceExit()
int bioem_cuda::deviceStartRun() int bioem_cuda::deviceStartRun()
{ {
maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice); cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
return(0); return(0);
} }
int bioem_cuda::deviceFinishRun() int bioem_cuda::deviceFinishRun()
{ {
if (GPUAsync) cudaStreamSynchronize(cudaStream); if (GPUAsync) cudaStreamSynchronize(cudaStream);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost); cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
return(0); return(0);
} }
......
...@@ -23,7 +23,7 @@ public: ...@@ -23,7 +23,7 @@ public:
int doProjections(int iMap); int doProjections(int iMap);
int createConvolutedProjectionMap(int iOreint,int iMap, mycomplex_t* lproj,bioem_map& Mapconv); int createConvolutedProjectionMap(int iOreint,int iMap, mycomplex_t* lproj,bioem_map& Mapconv);
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map); virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap = 0);
int createProjection(int iMap, mycomplex_t* map); int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare); int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare);
......
...@@ -11,7 +11,7 @@ public: ...@@ -11,7 +11,7 @@ public:
bioem_cuda(); bioem_cuda();
virtual ~bioem_cuda(); virtual ~bioem_cuda();
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map); virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap = 0);
protected: protected:
virtual int deviceInit(); virtual int deviceInit();
...@@ -28,8 +28,11 @@ protected: ...@@ -28,8 +28,11 @@ protected:
bioem_Probability* pProb_device; bioem_Probability* pProb_device;
bioem_map* pConvMap_device[2]; bioem_map* pConvMap_device[2];
int GPUAlgo; int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel.
int maxRef;
}; };
#endif #endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment