Commit e914db36 authored by David Rohr's avatar David Rohr
Browse files

implemented FFT algorithm on GPU, first version

parent 73602d4b
...@@ -28,6 +28,7 @@ INCLUDE_DIRECTORIES( include $HOME/usr/include ) ...@@ -28,6 +28,7 @@ INCLUDE_DIRECTORIES( include $HOME/usr/include )
IF(CUDA_FOUND) IF(CUDA_FOUND)
ADD_DEFINITIONS(-DWITH_CUDA) ADD_DEFINITIONS(-DWITH_CUDA)
CUDA_ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp bioem_cuda.cu ) CUDA_ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp bioem_cuda.cu )
CUDA_ADD_CUFFT_TO_TARGET(bioEM)
ELSE() ELSE()
ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp ) ADD_EXECUTABLE( bioEM bioem.cpp main.cpp map.cpp model.cpp param.cpp cmodules/timer.cpp )
ENDIF() ENDIF()
......
...@@ -263,7 +263,7 @@ int bioem::run() ...@@ -263,7 +263,7 @@ int bioem::run()
/*** Calculating convolutions of projection map and crosscorrelations ***/ /*** Calculating convolutions of projection map and crosscorrelations ***/
timer.ResetStart(); timer.ResetStart();
createConvolutedProjectionMap(iProjectionOut,iConv,proj_mapFFT,conv_map,conv_mapFFT,sumCONV,sumsquareCONV); createConvolutedProjectionMap(iProjectionOut, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
printf("Time Convolution %d %d: %f\n", iProjectionOut, iConv, timer.GetCurrentElapsedTime()); printf("Time Convolution %d %d: %f\n", iProjectionOut, iConv, timer.GetCurrentElapsedTime());
/***************************************************************************************/ /***************************************************************************************/
...@@ -348,10 +348,10 @@ int bioem::run() ...@@ -348,10 +348,10 @@ int bioem::run()
param.refCTF =NULL; param.refCTF =NULL;
} }
if(RefMap.RefMapFFT) if(RefMap.RefMapsFFT)
{ {
delete[] RefMap.RefMapFFT; delete[] RefMap.RefMapsFFT;
RefMap.RefMapFFT = NULL; RefMap.RefMapsFFT = NULL;
} }
return(0); return(0);
} }
...@@ -375,7 +375,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m ...@@ -375,7 +375,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m
for (int iRefMap = iStart; iRefMap < iEnd; iRefMap ++) for (int iRefMap = iStart; iRefMap < iEnd; iRefMap ++)
{ {
calculateCCFFT(iRefMap,iProjectionOut, iConv, sumC,sumsquareC, localmultFFT, localCCT,lCC); calculateCCFFT(iRefMap,iProjectionOut, iConv, sumC, sumsquareC, localmultFFT, localCCT,lCC);
} }
myfftw_free(localCCT); myfftw_free(localCCT);
myfftw_free(lCC); myfftw_free(lCC);
...@@ -392,82 +392,20 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m ...@@ -392,82 +392,20 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m
return(0); return(0);
} }
/////////////NEW ROUTINE //////////////// inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC)
inline int bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC)
{ {
const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.RefMapSize];
for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{ {
localCCT[i][0] = localConvFFT[i][0] * RefMap.RefMapFFT[iRefMap].cpoints[i][0] + localConvFFT[i][1] * RefMap.RefMapFFT[iRefMap].cpoints[i][1]; localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1];
localCCT[i][1] = localConvFFT[i][1] * RefMap.RefMapFFT[iRefMap].cpoints[i][0] - localConvFFT[i][0] * RefMap.RefMapFFT[iRefMap].cpoints[i][1]; localCCT[i][1] = localConvFFT[i][1] * RefMapFFT[i][0] - localConvFFT[i][0] * RefMapFFT[i][1];
} }
myfftw_execute_dft_c2r(param.fft_plan_c2r_backward,localCCT,lCC); myfftw_execute_dft_c2r(param.fft_plan_c2r_backward,localCCT,lCC);
// Storing CORRELATIONS FOR CORRESPONDING DISPLACEMENTS & Normalizing after Backward FFT doRefMapFFT(iRefMap, iOrient, iConv, lCC, sumC, sumsquareC, pProb, param.param_device, RefMap);
for (int cent_x = 0; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = 0; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels), cent_x, cent_y);
}
for (int cent_y = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_y < param.param_device.NumberPixels; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), cent_x, param.param_device.NumberPixels-cent_y);
}
}
for (int cent_x = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_x < param.param_device.NumberPixels; cent_x=cent_x+param.param_device.GridSpaceCenter)
{
for (int cent_y = 0; cent_y < param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), param.param_device.NumberPixels-cent_x, cent_y);
}
for (int cent_y = param.param_device.NumberPixels-param.param_device.maxDisplaceCenter; cent_y <= param.param_device.NumberPixels; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.param_device.NumberPixels+cent_y]/ (myfloat_t) (param.param_device.NumberPixels*param.param_device.NumberPixels), param.param_device.NumberPixels-cent_x, param.param_device.NumberPixels-cent_y);
}
}
return (0);
} }
inline int bioem::calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy)
{
/********************************************************/
/*********** Calculates the BioEM probability ***********/
/********************************************************/
const myfloat_t logpro = calc_logpro(param.param_device, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
//update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb);
//GCC is too stupid to inline properly, so the code is copied here
if(pProb[iRefMap].Constoadd < logpro)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro;
}
pProb[iRefMap].Total += exp(logpro - pProb[iRefMap].Constoadd);
if(pProb[iRefMap].ConstAngle[iOrient] < logpro)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro;
}
pProb[iRefMap].forAngles[iOrient] += exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
if(pProb[iRefMap].max_prob < logpro)
{
pProb[iRefMap].max_prob = logpro;
pProb[iRefMap].max_prob_cent_x = disx;
pProb[iRefMap].max_prob_cent_y = disy;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
}
return (0);
}
int bioem::createProjection(int iMap,mycomplex_t* mapFFT) int bioem::createProjection(int iMap,mycomplex_t* mapFFT)
{ {
/**************************************************************************************/ /**************************************************************************************/
...@@ -501,7 +439,6 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT) ...@@ -501,7 +439,6 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT)
rotmat[2][1]=-sin(beta)*cos(alpha); rotmat[2][1]=-sin(beta)*cos(alpha);
rotmat[2][2]=cos(beta); rotmat[2][2]=cos(beta);
for(int n=0; n< Model.nPointsModel; n++) for(int n=0; n< Model.nPointsModel; n++)
{ {
RotatedPointsModel[n].pos[0]=0.0; RotatedPointsModel[n].pos[0]=0.0;
...@@ -571,11 +508,12 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b ...@@ -571,11 +508,12 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b
/**** Multiplying FFTmap with corresponding kernel ****/ /**** Multiplying FFTmap with corresponding kernel ****/
const mycomplex_t* refCTF = &param.refCTF[iConv * param.RefMapSize];
for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{ {
localmultFFT[i][0] = lproj[i][0] * param.refCTF[iConv].cpoints[i][0] + lproj[i][1] * param.refCTF[iConv].cpoints[i][1]; localmultFFT[i][0] = lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1];
localmultFFT[i][1] = lproj[i][1] * param.refCTF[iConv].cpoints[i][0] - lproj[i][0] * param.refCTF[iConv].cpoints[i][1]; localmultFFT[i][1] = lproj[i][1] * refCTF[i][0] - lproj[i][0] * refCTF[i][1];
// cout << "GG " << i << " " << j << " " << param.refCTF[iConv].cpoints[i][0] << " " <<param.refCTF[iConv].cpoints[i][1] <<" " <<lproj[i][0] <<" " <<lproj[i][1] << "\n"; // cout << "GG " << i << " " << j << " " << refCTF[i][0] << " " << refCTF[i][1] <<" " <<lproj[i][0] <<" " <<lproj[i][1] << "\n";
} }
//FFTW_C2R will destroy the input array, so we have to work on a copy here //FFTW_C2R will destroy the input array, so we have to work on a copy here
......
...@@ -70,6 +70,66 @@ __device__ static inline myfloat_t calc_logpro(const bioem_param_device& param, ...@@ -70,6 +70,66 @@ __device__ static inline myfloat_t calc_logpro(const bioem_param_device& param,
return(logpro); return(logpro);
} }
__device__ static inline void calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy, bioem_Probability* pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
{
/********************************************************/
/*********** Calculates the BioEM probability ***********/
/********************************************************/
const myfloat_t logpro = calc_logpro(param, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
//update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb);
//GCC is too stupid to inline properly, so the code is copied here
if(pProb[iRefMap].Constoadd < logpro)
{
pProb[iRefMap].Total = pProb[iRefMap].Total * exp(-logpro + pProb[iRefMap].Constoadd);
pProb[iRefMap].Constoadd = logpro;
}
pProb[iRefMap].Total += exp(logpro - pProb[iRefMap].Constoadd);
if(pProb[iRefMap].ConstAngle[iOrient] < logpro)
{
pProb[iRefMap].forAngles[iOrient] = pProb[iRefMap].forAngles[iOrient] * exp(-logpro + pProb[iRefMap].ConstAngle[iOrient]);
pProb[iRefMap].ConstAngle[iOrient] = logpro;
}
pProb[iRefMap].forAngles[iOrient] += exp(logpro - pProb[iRefMap].ConstAngle[iOrient]);
if(pProb[iRefMap].max_prob < logpro)
{
pProb[iRefMap].max_prob = logpro;
pProb[iRefMap].max_prob_cent_x = disx;
pProb[iRefMap].max_prob_cent_y = disy;
pProb[iRefMap].max_prob_orient = iOrient;
pProb[iRefMap].max_prob_conv = iConv;
}
}
__device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
{
for (int cent_x = 0; cent_x <= param.maxDisplaceCenter; cent_x=cent_x+param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y <= param.maxDisplaceCenter; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels-param.maxDisplaceCenter; cent_y < param.NumberPixels; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), cent_x, param.NumberPixels-cent_y, pProb, param, RefMap);
}
}
for (int cent_x = param.NumberPixels-param.maxDisplaceCenter; cent_x < param.NumberPixels; cent_x=cent_x+param.GridSpaceCenter)
{
for (int cent_y = 0; cent_y < param.maxDisplaceCenter; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), param.NumberPixels-cent_x, cent_y, pProb, param, RefMap);
}
for (int cent_y = param.NumberPixels-param.maxDisplaceCenter; cent_y <= param.NumberPixels; cent_y=cent_y+param.GridSpaceCenter)
{
calProb(iRefMap, iOrient, iConv, sumC, sumsquareC, (myfloat_t) lCC[cent_x*param.NumberPixels+cent_y]/ (myfloat_t) (param.NumberPixels*param.NumberPixels), param.NumberPixels-cent_x, param.NumberPixels-cent_y, pProb, param, RefMap);
}
}
}
template <int GPUAlgo, class RefT> template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true) const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
......
...@@ -12,13 +12,13 @@ using namespace std; ...@@ -12,13 +12,13 @@ using namespace std;
//__constant__ bioem_map gConvMap; //__constant__ bioem_map gConvMap;
static inline void checkCudaErrors(cudaError_t error) #define checkCudaErrors(error) \
{ { \
if (error != cudaSuccess) if ((error) != cudaSuccess) \
{ { \
printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error)); printf("CUDA Error %d / %s (%s: %d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__); \
exit(1); exit(1); \
} } \
} }
bioem_cuda::bioem_cuda() bioem_cuda::bioem_cuda()
...@@ -52,6 +52,15 @@ __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, ...@@ -52,6 +52,15 @@ __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv,
} }
} }
__global__ void cudaZeroMem(void* ptr, size_t size)
{
int* myptr = (int*) ptr;
int mysize = size / sizeof(int);
int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
int mygrid = myBlockDimX * myGridDimX;
for (int i = myid;i < mysize;i += mygrid) myptr[i] = 0;
}
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef) __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{ {
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX; const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
...@@ -68,6 +77,26 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon ...@@ -68,6 +77,26 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive); compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
} }
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps)
{
if (myBlockIdxX >= NumberMaps) return;
const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize];
mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
for(int i = myThreadIdxX;i < NumberPixelsTotal;i += myBlockDimX)
{
myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
myout[i].y = convmap[i][1] * myin[i][0] - convmap[i][0] * myin[i][1];
}
}
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef)
{
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
const myfloat_t* mylCC = &lCC[iRefMap * param.NumberPixels * param.NumberPixels];
if (iRefMap >= maxRef) return;
doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap);
}
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);} template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));} static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32) #if defined(_WIN32)
...@@ -93,53 +122,70 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -93,53 +122,70 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
{ {
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1])); checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
} }
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts if (FFTAlgo)
{ {
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
if (!IsPowerOf2(nShifts)) multComplexMap<<<maxRef, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, maxRef);
cudaZeroMem<<<32, 256>>>(pFFTtmp, maxRef * sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
if (mycufftExecC2R(plan, pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{ {
cout << "Invalid number of displacements, no power of two\n"; cout << "Error running CUFFT\n";
exit(1); exit(1);
} }
if (CUDA_THREAD_COUNT % (nShifts * nShifts)) cuDoRefMapsFFT<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, maxRef);
checkCudaErrors(cudaGetLastError());
}
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts
{ {
cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n"; const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
exit(1); if (!IsPowerOf2(nShifts))
{
cout << "Invalid number of displacements, no power of two\n";
exit(1);
}
if (CUDA_THREAD_COUNT % (nShifts * nShifts))
{
cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
exit(1);
}
if (nShifts > CUDA_MAX_SHIFT_REDUCE)
{
cout << "Too many displacements for CUDA reduction\n";
exit(1);
}
const int nShiftBits = ilog2(nShifts);
size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
{
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
}
} }
if (nShifts > CUDA_MAX_SHIFT_REDUCE) else if (GPUAlgo == 1) //Split shifts in multiple kernels
{ {
cout << "Too many displacements for CUDA reduction\n"; for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
exit(1); {
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
{
compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
}
} }
const int nShiftBits = ilog2(nShifts); else if (GPUAlgo == 0) //All shifts in one kernel
size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks)
{ {
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef); compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
} }
} else
else if (GPUAlgo == 1) //Split shifts in multiple kernels
{
for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
{ {
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter) cout << "Invalid GPU Algorithm selected\n";
{ exit(1);
compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
} }
} }
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
}
else
{
cout << "Invalid GPU Algorithm selected\n";
exit(1);
}
if (GPUWorkload < 100) if (GPUWorkload < 100)
{ {
bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef); bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
...@@ -159,6 +205,8 @@ int bioem_cuda::deviceInit() ...@@ -159,6 +205,8 @@ int bioem_cuda::deviceInit()
{ {
deviceExit(); deviceExit();
if (FFTAlgo) GPUAlgo = 2;
checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cudaStreamCreate(&cudaStream));
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap))); checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap)); checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
...@@ -169,6 +217,32 @@ int bioem_cuda::deviceInit() ...@@ -169,6 +217,32 @@ int bioem_cuda::deviceInit()
} }
pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device; pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
if (FFTAlgo)
{
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp, RefMap.ntotRefMap * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2));
cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice);
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, RefMap.ntotRefMap) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan, cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
if (GPUAlgo == 0 || GPUAlgo == 1) if (GPUAlgo == 0 || GPUAlgo == 1)
{ {
bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap); bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
...@@ -179,6 +253,7 @@ int bioem_cuda::deviceInit() ...@@ -179,6 +253,7 @@ int bioem_cuda::deviceInit()
{ {
cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice); cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
} }
deviceInitialized = 1; deviceInitialized = 1;
return(0); return(0);
} }
...@@ -195,6 +270,14 @@ int bioem_cuda::deviceExit() ...@@ -195,6 +270,14 @@ int bioem_cuda::deviceExit()
cudaEventDestroy(cudaEvent[i]); cudaEventDestroy(cudaEvent[i]);
cudaFree(pConvMap_device); cudaFree(pConvMap_device);
} }
if (FFTAlgo)
{
cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT);
cudaFree(pFFTtmp);
cudaFree(pFFTtmp2);
cufftDestroy(plan);
}
cudaThreadExit(); cudaThreadExit();
deviceInitialized = 0; deviceInitialized = 0;
......
...@@ -27,8 +27,7 @@ public: ...@@ -27,8 +27,7 @@ public:
int createProjection(int iMap, mycomplex_t* map); int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare); int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare);
int calProb(int iRefMap,int iOrient, int iConv,myfloat_t sumC,myfloat_t sumsquareC, float value, int disx, int disy); void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC);
int calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC);
bioem_Probability* pProb; bioem_Probability* pProb;
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define BIOEM_CUDA_INTERNAL_H #define BIOEM_CUDA_INTERNAL_H
#include <cuda.h> #include <cuda.h>
#include <cufft.h>
//Hack to make nvcc compiler accept fftw.h, float128 is not used anyway //Hack to make nvcc compiler accept fftw.h, float128 is not used anyway
#define __float128 double #define __float128 double
...@@ -33,6 +34,12 @@ protected: ...@@ -33,6 +34,12 @@ protected:
bioem_Probability* pProb_device; bioem_Probability* pProb_device;
bioem_map* pConvMap_device[2]; bioem_map* pConvMap_device[2];