Commit e914db36 authored by David Rohr's avatar David Rohr
Browse files

implemented FFT algorithm on GPU, first version

parent 73602d4b
......@@ -28,6 +28,7 @@ INCLUDE_DIRECTORIES( include $HOME/usr/include )
IF(CUDA_FOUND)
ADD_DEFINITIONS(-DWITH_CUDA)
CUDA_ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp bioem_cuda.cu )
CUDA_ADD_CUFFT_TO_TARGET(bioEM)
ELSE()
ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp )
ENDIF()
......
......@@ -263,7 +263,7 @@ int bioem::run()
/*** Calculating convolutions of projection map and crosscorrelations ***/
timer.ResetStart();
createConvolutedProjectionMap(iProjectionOut,iConv,proj_mapFFT,conv_map,conv_mapFFT,sumCONV,sumsquareCONV);
createConvolutedProjectionMap(iProjectionOut, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
printf("Time Convolution %d %d: %f\n", iProjectionOut, iConv, timer.GetCurrentElapsedTime());
/***************************************************************************************/
......@@ -348,10 +348,10 @@ int bioem::run()
param.refCTF =NULL;
}
if(RefMap.RefMapFFT)
if(RefMap.RefMapsFFT)
{
delete[] RefMap.RefMapFFT;
RefMap.RefMapFFT = NULL;
delete[] RefMap.RefMapsFFT;
RefMap.RefMapsFFT = NULL;
}
return(0);
}
......@@ -375,7 +375,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m
for (int iRefMap = iStart; iRefMap < iEnd; iRefMap ++)
{
calculateCCFFT(iRefMap,iProjectionOut, iConv, sumC,sumsquareC, localmultFFT, localCCT,lCC);
calculateCCFFT(iRefMap,iProjectionOut, iConv, sumC, sumsquareC, localmultFFT, localCCT,lCC);
}
myfftw_free(localCCT);
myfftw_free(lCC);
......@@ -392,82 +392,20 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m
return(0);
}
/////////////NEW ROUTINE ////////////////
inline int bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC)
inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC)
{
const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.RefMapSize];
for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{
localCCT[i][0] = localConvFFT[i][0] * RefMap.RefMapFFT[iRefMap].cpoints[i][0] + localConvFFT[i][1] * RefMap.RefMapFFT[iRefMap].cpoints[i][1];
localCCT[i][1] = localConvFFT[i][1] * RefMap.RefMapFFT[iRefMap].cpoints[i][0] - localConvFFT[i][0] * RefMap.RefMapFFT[iRefMap].cpoints[i][1];
localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1];
localCCT[i][1] = localConvFFT[i][1] * RefMapFFT[i][0] - localConvFFT[i][0] * RefMapFFT[i][1];
}
myfftw_execute_dft_c2r(param.fft_plan_c2r_backward,localCCT,lCC);
// Storing CORRELATIONS FOR CORRESPONDING DISPLACEMENTS & Normalizing after Backward FFT
for (int cent_x = 0; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = 0; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels), cent_x, cent_y);
}
for (int cent_y = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_y < param.param_device.NumberPixels; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), cent_x, param.param_device.NumberPixels-cent_y);
}
}
for (int cent_x = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_x < param.param_device.NumberPixels; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = 0; cent_y < param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), param.param_device.NumberPixels-cent_x, cent_y);
}
for (int cent_y = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_y <= param.param_device.NumberPixels; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), param.param_device.NumberPixels-cent_x, param.param_device.NumberPixels-cent_y);
}
}
return (0);
doRefMapFFT(iRefMap, iOrient, iConv, lCC, sumC, sumsquareC, pProb, param.param_device, RefMap);
}
inline int bioem::calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy)
{
/********************************************************/
/*********** Calculates the BioEM probability ***********/
/********************************************************/
const myfloat_t logpro = calc_logpro(param.param_device, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
//update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb);
//GCC is too stupid to inline properly, so the code is copied here
if(pProb[iRefMap].Constoadd < logpro)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro;
}
pProb[iRefMap].Total += exp(logpro - pProb[iRefMap].Constoadd);
if(pProb[iRefMap].ConstAngle[iOrient] < logpro)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro;
}
pProb[iRefMap].forAngles[iOrient] += exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
if(pProb[iRefMap].max_prob < logpro)
{
pProb[iRefMap].max_prob = logpro;
pProb[iRefMap].max_prob_cent_x = disx;
pProb[iRefMap].max_prob_cent_y = disy;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
}
return (0);
}
int bioem::createProjection(int iMap,mycomplex_t* mapFFT)
{
/**************************************************************************************/
......@@ -501,7 +439,6 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT)
rotmat[2][1]=-sin(beta)*cos(alpha);
rotmat[2][2]=cos(beta);
for(int n=0; n< Model.nPointsModel; n++)
{
RotatedPointsModel[n].pos[0]=0.0;
......@@ -571,11 +508,12 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b
/**** Multiplying FFTmap with corresponding kernel ****/
const mycomplex_t* refCTF = &param.refCTF[iConv * param.RefMapSize];
for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{
localmultFFT[i][0] = lproj[i][0] * param.refCTF[iConv].cpoints[i][0] + lproj[i][1] * param.refCTF[iConv].cpoints[i][1];
localmultFFT[i][1] = lproj[i][1] * param.refCTF[iConv].cpoints[i][0] - lproj[i][0] * param.refCTF[iConv].cpoints[i][1];
// cout << "GG " << i << " " << j << " " << param.refCTF[iConv].cpoints[i][0] << " " <<param.refCTF[iConv].cpoints[i][1] <<" " <<lproj[i][0] <<" " <<lproj[i][1] << "\n";
localmultFFT[i][0] = lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1];
localmultFFT[i][1] = lproj[i][1] * refCTF[i][0] - lproj[i][0] * refCTF[i][1];
// cout << "GG " << i << " " << j << " " << refCTF[i][0] << " " << refCTF[i][1] <<" " <<lproj[i][0] <<" " <<lproj[i][1] << "\n";
}
//FFTW_C2R will destroy the input array, so we have to work on a copy here
......
......@@ -70,6 +70,66 @@ __device__ static inline myfloat_t calc_logpro(const bioem_param_device& param,
return(logpro);
}
__device__ static inline void calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy, bioem_Probability* pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
{
/********************************************************/
/*********** Calculates the BioEM probability ***********/
/********************************************************/
const myfloat_t logpro = calc_logpro(param, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
//update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb);
//GCC is too stupid to inline properly, so the code is copied here
if(pProb[iRefMap].Constoadd < logpro)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro;
}
pProb[iRefMap].Total += exp(logpro - pProb[iRefMap].Constoadd);
if(pProb[iRefMap].ConstAngle[iOrient] < logpro)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro;
}
pProb[iRefMap].forAngles[iOrient] += exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
if(pProb[iRefMap].max_prob < logpro)
{
pProb[iRefMap].max_prob = logpro;
pProb[iRefMap].max_prob_cent_x = disx;
pProb[iRefMap].max_prob_cent_y = disy;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
}
}
__device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
{
for (int cent_x = 0; cent_x <= param.maxDisplaceCenter; cent_x=cent_x+param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y <= param.maxDisplaceCenter; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels-param.maxDisplaceCenter; cent_y < param.NumberPixels; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), cent_x, param.NumberPixels-cent_y, pProb, param, RefMap);
}
}
for (int cent_x = param.NumberPixels-param.maxDisplaceCenter; cent_x < param.NumberPixels; cent_x=cent_x+param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y < param.maxDisplaceCenter; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), param.NumberPixels-cent_x, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels-param.maxDisplaceCenter; cent_y <= param.NumberPixels; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), param.NumberPixels-cent_x, param.NumberPixels-cent_y, pProb, param, RefMap);
}
}
}
template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
......
......@@ -12,13 +12,13 @@ using namespace std;
//__constant__ bioem_map gConvMap;
static inline void checkCudaErrors(cudaError_t error)
{
if (error != cudaSuccess)
{
printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
exit(1);
}
#define checkCudaErrors(error) \
{ \
if ((error) != cudaSuccess) \
{ \
printf("CUDA Error %d / %s (%s: %d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__); \
exit(1); \
} \
}
bioem_cuda::bioem_cuda()
......@@ -52,6 +52,15 @@ __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv,
}
}
__global__ void cudaZeroMem(void* ptr, size_t size)
{
int* myptr = (int*) ptr;
int mysize = size / sizeof(int);
int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
int mygrid = myBlockDimX * myGridDimX;
for (int i = myid;i < mysize;i += mygrid) myptr[i] = 0;
}
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
......@@ -68,6 +77,26 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps)
{
if (myBlockIdxX >= NumberMaps) return;
const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize];
mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
for(int i = myThreadIdxX;i < NumberPixelsTotal;i += myBlockDimX)
{
myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
myout[i].y = convmap[i][1] * myin[i][0] - convmap[i][0] * myin[i][1];
}
}
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
const myfloat_t* mylCC = &lCC[iRefMap * param.NumberPixels * param.NumberPixels];
if (iRefMap >= maxRef) return;
doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap);
}
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
......@@ -93,53 +122,70 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
{
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
}
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts
if (FFTAlgo)
{
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
if (!IsPowerOf2(nShifts))
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
multComplexMap<<<maxRef, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, maxRef);
cudaZeroMem<<<32, 256>>>(pFFTtmp, maxRef * sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
if (mycufftExecC2R(plan, pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{
cout << "Invalid number of displacements, no power of two\n";
cout << "Error running CUFFT\n";
exit(1);
}
if (CUDA_THREAD_COUNT % (nShifts * nShifts))
cuDoRefMapsFFT<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, maxRef);
checkCudaErrors(cudaGetLastError());
}
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts
{
cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
exit(1);
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
if (!IsPowerOf2(nShifts))
{
cout << "Invalid number of displacements, no power of two\n";
exit(1);
}
if (CUDA_THREAD_COUNT % (nShifts * nShifts))
{
cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
exit(1);
}
if (nShifts > CUDA_MAX_SHIFT_REDUCE)
{
cout << "Too many displacements for CUDA reduction\n";
exit(1);
}
const int nShiftBits = ilog2(nShifts);
size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
{
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
}
}
if (nShifts > CUDA_MAX_SHIFT_REDUCE)
else if (GPUAlgo == 1) //Split shifts in multiple kernels
{
cout << "Too many displacements for CUDA reduction\n";
exit(1);
for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
}
}
const int nShiftBits = ilog2(nShifts);
size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
}
}
else if (GPUAlgo == 1) //Split shifts in multiple kernels
{
for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
else
{
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
cout << "Invalid GPU Algorithm selected\n";
exit(1);
}
}
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
}
else
{
cout << "Invalid GPU Algorithm selected\n";
exit(1);
}
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
......@@ -159,6 +205,8 @@ int bioem_cuda::deviceInit()
{
deviceExit();
if (FFTAlgo) GPUAlgo = 2;
checkCudaErrors(cudaStreamCreate(&cudaStream));
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
......@@ -169,6 +217,32 @@ int bioem_cuda::deviceInit()
}
pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
if (FFTAlgo)
{
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp, RefMap.ntotRefMap * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2));
cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice);
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, RefMap.ntotRefMap) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan, cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
if (GPUAlgo == 0 || GPUAlgo == 1)
{
bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
......@@ -179,6 +253,7 @@ int bioem_cuda::deviceInit()
{
cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
}
deviceInitialized = 1;
return(0);
}
......@@ -195,6 +270,14 @@ int bioem_cuda::deviceExit()
cudaEventDestroy(cudaEvent[i]);
cudaFree(pConvMap_device);
}
if (FFTAlgo)
{
cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT);
cudaFree(pFFTtmp);
cudaFree(pFFTtmp2);
cufftDestroy(plan);
}
cudaThreadExit();
deviceInitialized = 0;
......
......@@ -27,8 +27,7 @@ public:
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare);
int calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy);
int calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC);
void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC);
bioem_Probability* pProb;
......
......@@ -2,6 +2,7 @@
#define BIOEM_CUDA_INTERNAL_H
#include <cuda.h>
#include <cufft.h>
//Hack to make nvcc compiler accept fftw.h, float128 is not used anyway
#define __float128 double
......@@ -33,6 +34,12 @@ protected:
bioem_Probability* pProb_device;
bioem_map* pConvMap_device[2];
mycomplex_t* pRefMapsFFT;
mycomplex_t* pConvMapFFT;
mycuComplex_t* pFFTtmp2;
myfloat_t* pFFTtmp;
cufftHandle plan;
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel.
......
......@@ -16,6 +16,8 @@ typedef float myfloat_t;
#define myfftw_plan_dft_r2c_2d fftwf_plan_dft_r2c_2d
#define myfftw_plan_dft_c2r_2d fftwf_plan_dft_c2r_2d
#define myfftw_plan fftwf_plan
#define mycufftExecC2R cufftExecC2R
#define mycuComplex_t cuComplex
#else
typedef double myfloat_t;
#define myfftw_malloc fftw_malloc
......@@ -29,6 +31,8 @@ typedef double myfloat_t;
#define myfftw_plan_dft_r2c_2d fftw_plan_dft_r2c_2d
#define myfftw_plan_dft_c2r_2d fftw_plan_dft_c2r_2d
#define myfftw_plan fftw_plan
#define mycufftExecC2R cufftExecZ2D
#define mycuComplex_t cuDoubleComplex
#endif
typedef myfloat_t mycomplex_t[2];
......@@ -36,10 +40,8 @@ typedef myfloat_t mycomplex_t[2];
#define BIOEM_MAP_SIZE_X 224
#define BIOEM_MAP_SIZE_Y 224
#define BIOEM_MODEL_SIZE 120000
#define BIOEM_MAX_MAPS 12000
#define MAX_REF_CTF 200
#define BIOEM_MAX_MAPS 4000
#define MAX_ORIENT 20000
#define MAX_DISPLACE 224
struct myfloat3_t
{
......@@ -53,6 +55,7 @@ struct myfloat3_t
#define myBlockDimY blockDim.y
#define myBlockIdxX blockIdx.x
#define myBlockIdxY blockIdx.y
#define myGridDimX gridDim.x
#else
#define __device__
#define __host__
......
......@@ -13,27 +13,17 @@ public:
myfloat_t points[BIOEM_MAP_SIZE_X][BIOEM_MAP_SIZE_Y];
};
class bioem_map_forFFT
{
public:
mycomplex_t cpoints[BIOEM_MAP_SIZE_X*BIOEM_MAP_SIZE_Y];
};
class bioem_convolutedMap
{
public:
bioem_map conv[MAX_REF_CTF];
myfloat_t sum_convMap[MAX_REF_CTF];
myfloat_t sumsquare_convMap[MAX_REF_CTF];
myfloat_t ForLogProbfromConv[MAX_REF_CTF];
};
class bioem_RefMap
{
public:
bioem_RefMap()
{
RefMapsFFT = NULL;
}
int readRefMaps(bioem_param& param);
int PreCalculateMapsFFT(bioem_param& param);
bioem_map_forFFT* RefMapFFT;
mycomplex_t* RefMapsFFT;
const char* filemap;
int ntotRefMap;
......@@ -83,14 +73,6 @@ public:
}
};
class bioem_crossCor
{
public:
int disx[MAX_DISPLACE];
int disy[MAX_DISPLACE];
myfloat_t value[MAX_DISPLACE];
};
class bioem_Probability
{
public:
......
......@@ -7,8 +7,6 @@
#include <math.h>
#include <fftw3.h>
class bioem_map_forFFT;
class bioem_param_device
{
public:
......@@ -34,7 +32,9 @@ public:
bioem_param_device param_device;
bioem_map_forFFT* refCTF;
int RefMapSize;
mycomplex_t* refCTF;
myfloat3_t* CtfParam;
// File names
const char* fileinput;
......@@ -66,7 +66,6 @@ public: