Commit 4f312113 authored by David Rohr's avatar David Rohr

Implement combined GPU/CPU processing via GPUWORKLOAD environment variable

parent c9754f9e
......@@ -347,10 +347,10 @@ int bioem::run()
return(0);
}
int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
{
#pragma omp parallel for
for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap ++)
for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
{
compareRefMapShifted<-1>(iRefMap,iProjectionOut,iConv,conv_map, pProb, param.param_device, RefMap);
}
......
......@@ -13,7 +13,7 @@
template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0)
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
{
/**************************************************************************************/
/********************** Calculating BioEM Probability ********************************/
......@@ -31,7 +31,8 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
myfloat_t crossproMapConv=0.0;
#endif
/****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/
if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap)
myfloat_t logpro;
if (GPUAlgo != 2 || threadActive)
{
int iStart, jStart, iEnd, jEnd;
if (cent_x < 0)
......@@ -96,19 +97,23 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
sumsquare = _mm_cvtss_f32(sumsquare_v);
crossproMapConv = _mm_cvtss_f32(cross_v);
#endif
}
/********** Calculating elements in BioEM Probability formula ********/
// Related to Reference calculated Projection
const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
/********** Calculating elements in BioEM Probability formula ********/
// Related to Reference calculated Projection
const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
// Products of different cross-correlations (first element in formula)
const myfloat_t firstele = param.Ntotpi * (RefMap.sumsquare_RefMap[iRefMap] * sumsquare-crossproMapConv * crossproMapConv) +
2 * RefMap.sum_RefMap[iRefMap] * sum * crossproMapConv - RefMap.sumsquare_RefMap[iRefMap] * sum * sum - RefMap.sum_RefMap[iRefMap] * RefMap.sum_RefMap[iRefMap] * sumsquare;
// Products of different cross-correlations (first element in formula)
const myfloat_t firstele = param.Ntotpi * (RefMap.sumsquare_RefMap[iRefMap] * sumsquare-crossproMapConv * crossproMapConv) +
2 * RefMap.sum_RefMap[iRefMap] * sum * crossproMapConv - RefMap.sumsquare_RefMap[iRefMap] * sum * sum - RefMap.sum_RefMap[iRefMap] * RefMap.sum_RefMap[iRefMap] * sumsquare;
//******* Calculating log of Prob*********/
// As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
const myfloat_t logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
//******* Calculating log of Prob*********/
// As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
}
else
{
logpro = 0;
}
#ifdef BIOEM_GPUCODE
if (GPUAlgo == 2)
......
......@@ -25,7 +25,8 @@ bioem_cuda::bioem_cuda()
{
deviceInitialized = 0;
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 0 : atoi(getenv("GPUASYNC"));
GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
}
bioem_cuda::~bioem_cuda()
......@@ -33,25 +34,25 @@ bioem_cuda::~bioem_cuda()
deviceExit();
}
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < RefMap->ntotRefMap)
if (iRefMap < maxRef)
{
compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
}
}
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < RefMap->ntotRefMap)
if (iRefMap < maxRef)
{
compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
}
}
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
const int iRefMap = myid >> (nShiftBits << 1);
......@@ -62,7 +63,9 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
......@@ -78,8 +81,13 @@ static inline int ilog2 (int value)
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
{
if (startMap)
{
cout << "Error startMap not implemented for GPU Code\n";
exit(1);
}
//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
if (GPUAsync)
{
......@@ -106,11 +114,11 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
exit(1);
}
const int nShiftBits = ilog2(nShifts);
size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
{
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
}
}
else if (GPUAlgo == 1) //Split shifts in multiple kernels
......@@ -119,23 +127,27 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
{
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
}
}
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod);
compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
}
else
{
cout << "Invalid GPU Algorithm selected\n";
exit(1);
}
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iProjectionOut, iConv, conv_map, maxRef);
}
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
}
}
else
{
checkCudaErrors(cudaStreamSynchronize(cudaStream));
......@@ -191,15 +203,16 @@ int bioem_cuda::deviceExit()
int bioem_cuda::deviceStartRun()
{
maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice);
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
return(0);
}
int bioem_cuda::deviceFinishRun()
{
if (GPUAsync) cudaStreamSynchronize(cudaStream);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
return(0);
}
......
......@@ -23,7 +23,7 @@ public:
int doProjections(int iMap);
int createConvolutedProjectionMap(int iOreint,int iMap, mycomplex_t* lproj,bioem_map& Mapconv);
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map);
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap = 0);
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare);
......
......@@ -11,7 +11,7 @@ public:
bioem_cuda();
virtual ~bioem_cuda();
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map);
virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap = 0);
protected:
virtual int deviceInit();
......@@ -28,8 +28,11 @@ protected:
bioem_Probability* pProb_device;
bioem_map* pConvMap_device[2];
int GPUAlgo;
int GPUAsync;
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel.
int maxRef;
};
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment