Commit a30096fc authored by David Rohr's avatar David Rohr
Browse files

start work to remove BIOEM_MAX_MAPS constant

parent 81fab4b2
...@@ -57,8 +57,8 @@ int bioem::configure(int ac, char* av[]) ...@@ -57,8 +57,8 @@ int bioem::configure(int ac, char* av[])
std::string infile,modelfile,mapfile; std::string infile,modelfile,mapfile;
Model.readPDB=false; Model.readPDB=false;
param.writeAngles=false; param.writeAngles=false;
RefMap.dumpMap = false; param.dumpMap = false;
RefMap.loadMap = false; param.loadMap = false;
/*************************************************************************************/ /*************************************************************************************/
cout << " ++++++++++++ FROM COMMAND LINE +++++++++++\n\n"; cout << " ++++++++++++ FROM COMMAND LINE +++++++++++\n\n";
...@@ -93,12 +93,12 @@ int bioem::configure(int ac, char* av[]) ...@@ -93,12 +93,12 @@ int bioem::configure(int ac, char* av[])
if((ac < 6)) { if((ac < 6)) {
std::cout << desc << std::endl; std::cout << desc << std::endl;
return 0; return 1;
} }
if (vm.count("help")) { if (vm.count("help")) {
cout << "Usage: options_description [options]\n"; cout << "Usage: options_description [options]\n";
cout << desc; cout << desc;
return 0; return 1;
} }
if (vm.count("Inputfile")) if (vm.count("Inputfile"))
...@@ -123,13 +123,13 @@ int bioem::configure(int ac, char* av[]) ...@@ -123,13 +123,13 @@ int bioem::configure(int ac, char* av[])
if (vm.count("DumpMaps")) if (vm.count("DumpMaps"))
{ {
cout << "Dumping Maps after reading from file.\n"; cout << "Dumping Maps after reading from file.\n";
RefMap.dumpMap = true; param.dumpMap = true;
} }
if (vm.count("LoadMapDump")) if (vm.count("LoadMapDump"))
{ {
cout << "Loading Map dump.\n"; cout << "Loading Map dump.\n";
RefMap.loadMap = true; param.loadMap = true;
} }
if (vm.count("Particlesfile")) if (vm.count("Particlesfile"))
...@@ -158,7 +158,7 @@ int bioem::configure(int ac, char* av[]) ...@@ -158,7 +158,7 @@ int bioem::configure(int ac, char* av[])
/********************* Reading Particle Maps Input **********************/ /********************* Reading Particle Maps Input **********************/
/********* HERE: PROBLEM if maps dont fit on the memory!! ***************/ /********* HERE: PROBLEM if maps dont fit on the memory!! ***************/
// copying mapfile to ref map class // copying mapfile to ref map class
RefMap.filemap = mapfile.c_str(); param.filemap = mapfile.c_str();
RefMap.readRefMaps(param); RefMap.readRefMaps(param);
/****************** Precalculating Necessary Stuff *********************/ /****************** Precalculating Necessary Stuff *********************/
...@@ -189,7 +189,7 @@ int bioem::precalculate() ...@@ -189,7 +189,7 @@ int bioem::precalculate()
//Precalculating cross-correlations of maps //Precalculating cross-correlations of maps
for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap ; iRefMap++) for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap ; iRefMap++)
{ {
calcross_cor(RefMap.Ref[iRefMap],sum,sumsquare); calcross_cor(RefMap.getmap(iRefMap),sum,sumsquare);
//Storing Crosscorrelations in Map class //Storing Crosscorrelations in Map class
RefMap.sum_RefMap[iRefMap]=sum; RefMap.sum_RefMap[iRefMap]=sum;
RefMap.sumsquare_RefMap[iRefMap]=sumsquare; RefMap.sumsquare_RefMap[iRefMap]=sumsquare;
...@@ -235,7 +235,7 @@ int bioem::run() ...@@ -235,7 +235,7 @@ int bioem::run()
/*** Declaring Private variables for each thread *****/ /*** Declaring Private variables for each thread *****/
mycomplex_t* proj_mapFFT; mycomplex_t* proj_mapFFT;
bioem_map conv_map; myfloat_t* conv_map = new myfloat_t[param.param_device.NumberPixels * param.param_device.NumberPixels];
mycomplex_t* conv_mapFFT; mycomplex_t* conv_mapFFT;
myfloat_t sumCONV,sumsquareCONV; myfloat_t sumCONV,sumsquareCONV;
...@@ -285,6 +285,7 @@ int bioem::run() ...@@ -285,6 +285,7 @@ int bioem::run()
//deallocating fftw_complex vector //deallocating fftw_complex vector
myfftw_free(proj_mapFFT); myfftw_free(proj_mapFFT);
myfftw_free(conv_mapFFT); myfftw_free(conv_mapFFT);
delete[] conv_map;
deviceFinishRun(); deviceFinishRun();
...@@ -348,15 +349,11 @@ int bioem::run() ...@@ -348,15 +349,11 @@ int bioem::run()
param.refCTF =NULL; param.refCTF =NULL;
} }
if(RefMap.RefMapsFFT) RefMap.freePointers();
{
delete[] RefMap.RefMapsFFT;
RefMap.RefMapsFFT = NULL;
}
return(0); return(0);
} }
int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) int bioem::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
{ {
if (FFTAlgo) if (FFTAlgo)
{ {
...@@ -394,7 +391,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m ...@@ -394,7 +391,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m
inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC) inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC)
{ {
const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.RefMapSize]; const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.FFTMapSize];
for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{ {
localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1]; localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1];
...@@ -493,7 +490,7 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT) ...@@ -493,7 +490,7 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT)
return(0); return(0);
} }
int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,bioem_map& Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC) int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,myfloat_t* Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC)
{ {
/**************************************************************************************/ /**************************************************************************************/
/**** BioEM Create Convoluted Projection Map routine, multiplies in Fourier ********** /**** BioEM Create Convoluted Projection Map routine, multiplies in Fourier **********
...@@ -508,7 +505,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b ...@@ -508,7 +505,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b
/**** Multiplying FFTmap with corresponding kernel ****/ /**** Multiplying FFTmap with corresponding kernel ****/
const mycomplex_t* refCTF = &param.refCTF[iConv * param.RefMapSize]; const mycomplex_t* refCTF = &param.refCTF[iConv * param.FFTMapSize];
for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++)
{ {
localmultFFT[i][0] = lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1]; localmultFFT[i][0] = lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1];
...@@ -527,7 +524,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b ...@@ -527,7 +524,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b
{ {
for(int j=0; j < param.param_device.NumberPixels ; j++ ) for(int j=0; j < param.param_device.NumberPixels ; j++ )
{ {
Mapconv.points[i][j]=localconvFFT[i*param.param_device.NumberPixels+j]; Mapconv[i*param.param_device.NumberPixels+j] = localconvFFT[i*param.param_device.NumberPixels+j];
} }
} }
...@@ -553,7 +550,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b ...@@ -553,7 +550,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b
return(0); return(0);
} }
int bioem::calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare) int bioem::calcross_cor(myfloat_t* localmap,myfloat_t& sum,myfloat_t& sumsquare)
{ {
/*********************** Routine to calculate Cross correlations***********************/ /*********************** Routine to calculate Cross correlations***********************/
...@@ -564,9 +561,9 @@ int bioem::calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare) ...@@ -564,9 +561,9 @@ int bioem::calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare)
for (int j = 0; j < param.param_device.NumberPixels; j++) for (int j = 0; j < param.param_device.NumberPixels; j++)
{ {
// Calculate Sum of pixels // Calculate Sum of pixels
sum += localmap.points[i][j]; sum += localmap[i*param.param_device.NumberPixels+j];
// Calculate Sum of pixels squared // Calculate Sum of pixels squared
sumsquare += localmap.points[i][j]*localmap.points[i][j]; sumsquare += localmap[i*param.param_device.NumberPixels+j] * localmap[i*param.param_device.NumberPixels+j];
} }
} }
return(0); return(0);
......
...@@ -131,16 +131,16 @@ __device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, ...@@ -131,16 +131,16 @@ __device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient,
} }
template <int GPUAlgo, class RefT> template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true) const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
{ {
/**************************************************************************************/
/********************** Calculating BioEM Probability ********************************/ // ********************** Calculating BioEM Probability ********************************
/************************* Loop of center displacement here ***************************/ // ************************* Loop of center displacement here ***************************
// Taking into account the center displacement // Taking into account the center displacement
/*** Inizialzing crosscorrelations of calculated projected convolutions ***/ // Inizialzing crosscorrelations of calculated projected convolutions
#ifdef SSECODE #ifdef SSECODE
myfloat_t sum, sumsquare, crossproMapConv; myfloat_t sum, sumsquare, crossproMapConv;
__m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2; __m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2;
...@@ -149,7 +149,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -149,7 +149,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
myfloat_t sumsquare=0.0; myfloat_t sumsquare=0.0;
myfloat_t crossproMapConv=0.0; myfloat_t crossproMapConv=0.0;
#endif #endif
/****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/ // Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map
myfloat_t logpro; myfloat_t logpro;
if (GPUAlgo != 2 || threadActive) if (GPUAlgo != 2 || threadActive)
{ {
...@@ -195,7 +195,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -195,7 +195,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
#else #else
for (int j = jStart; j < jEnd; j += 1) for (int j = jStart; j < jEnd; j += 1)
{ {
const myfloat_t pointMap = Mapconv.points[i + cent_x][j + cent_y]; const myfloat_t pointMap = Mapconv[(i + cent_x) * param.NumberPixels + j + cent_y];
const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j); const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
crossproMapConv += pointMap * pointRefMap; crossproMapConv += pointMap * pointRefMap;
// Crosscorrelation of calculated displaced map // Crosscorrelation of calculated displaced map
...@@ -217,7 +217,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -217,7 +217,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
crossproMapConv = _mm_cvtss_f32(cross_v); crossproMapConv = _mm_cvtss_f32(cross_v);
#endif #endif
/********** Calculating elements in BioEM Probability formula ********/ // Calculating elements in BioEM Probability formula
logpro = calc_logpro(param, sum, sumsquare, crossproMapConv, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]); logpro = calc_logpro(param, sum, sumsquare, crossproMapConv, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
} }
else else
...@@ -380,14 +380,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -380,14 +380,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
else else
#endif #endif
/***** Summing & Storing total/Orientation Probabilites for each map ************/ // Summing & Storing total/Orientation Probabilites for each map
{ {
update_prob<-1>(logpro, iRefMap, iOrient, iConv, cent_x, cent_y, pProb); update_prob<-1>(logpro, iRefMap, iOrient, iConv, cent_x, cent_y, pProb);
} }
} }
template <int GPUAlgo, class RefT> template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap) __device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap)
{ {
for (int cent_x = -param.maxDisplaceCenter; cent_x <= param.maxDisplaceCenter; cent_x=cent_x+param.GridSpaceCenter) for (int cent_x = -param.maxDisplaceCenter; cent_x <= param.maxDisplaceCenter; cent_x=cent_x+param.GridSpaceCenter)
{ {
......
...@@ -10,8 +10,6 @@ using namespace std; ...@@ -10,8 +10,6 @@ using namespace std;
#include "bioem_cuda_internal.h" #include "bioem_cuda_internal.h"
#include "bioem_algorithm.h" #include "bioem_algorithm.h"
//__constant__ bioem_map gConvMap;
#define checkCudaErrors(error) \ #define checkCudaErrors(error) \
{ \ { \
if ((error) != cudaSuccess) \ if ((error) != cudaSuccess) \
...@@ -34,21 +32,21 @@ bioem_cuda::~bioem_cuda() ...@@ -34,21 +32,21 @@ bioem_cuda::~bioem_cuda()
deviceExit(); deviceExit();
} }
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef) __global__ void compareRefMap_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
{ {
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < maxRef) if (iRefMap < maxRef)
{ {
compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y); compareRefMap<0>(iRefMap,iOrient,iConv, pMap, pProb, param, *RefMap, cent_x, cent_y);
} }
} }
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef) __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
{ {
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
if (iRefMap < maxRef) if (iRefMap < maxRef)
{ {
compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap); compareRefMapShifted<1>(iRefMap,iOrient,iConv, pMap, pProb, param, *RefMap);
} }
} }
...@@ -61,7 +59,7 @@ __global__ void cudaZeroMem(void* ptr, size_t size) ...@@ -61,7 +59,7 @@ __global__ void cudaZeroMem(void* ptr, size_t size)
for (int i = myid;i < mysize;i += mygrid) myptr[i] = 0; for (int i = myid;i < mysize;i += mygrid) myptr[i] = 0;
} }
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef) __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{ {
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX; const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
const int iRefMap = myid >> (nShiftBits << 1); const int iRefMap = myid >> (nShiftBits << 1);
...@@ -74,7 +72,7 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon ...@@ -74,7 +72,7 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef; const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive); compareRefMap<2>(iRefMap,iOrient,iConv, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
} }
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset) __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
...@@ -89,12 +87,12 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re ...@@ -89,12 +87,12 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re
} }
} }
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef, const int Offset) __global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset)
{ {
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset; const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels]; const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
if (iRefMap >= maxRef) return; if (iRefMap >= maxRef) return;
doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap); doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, RefMap);
} }
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);} template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
...@@ -110,14 +108,13 @@ static inline int ilog2 (int value) ...@@ -110,14 +108,13 @@ static inline int ilog2 (int value)
static inline int ilog2(int value) {return 31 - __builtin_clz(value);} static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif #endif
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
{ {
if (startMap) if (startMap)
{ {
cout << "Error startMap not implemented for GPU Code\n"; cout << "Error startMap not implemented for GPU Code\n";
exit(1); exit(1);
} }
//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
if (GPUAsync) if (GPUAsync)
{ {
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1])); checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
...@@ -125,23 +122,23 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -125,23 +122,23 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
if (FFTAlgo) if (FFTAlgo)
{ {
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream)); checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
for (int i = 0;i < maxRef;i += CUDA_FFTS_AT_ONCE) for (int i = 0;i < maxRef;i += CUDA_FFTS_AT_ONCE)
{ {
const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i); const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, num, i); multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS) if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{ {
cout << "Error running CUFFT\n"; cout << "Error running CUFFT\n";
exit(1); exit(1);
} }
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, num, i); cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
} }
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
} }
else else
{ {
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream)); checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream));
if (GPUAlgo == 2) //Loop over shifts if (GPUAlgo == 2) //Loop over shifts
{ {
...@@ -166,7 +163,7 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -166,7 +163,7 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
size_t nBlocks = CUDA_BLOCK_COUNT; size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0;i < totalBlocks;i += nBlocks) for (size_t i = 0;i < totalBlocks;i += nBlocks)
{ {
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef); compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
} }
} }
else if (GPUAlgo == 1) //Split shifts in multiple kernels else if (GPUAlgo == 1) //Split shifts in multiple kernels
...@@ -210,40 +207,45 @@ int bioem_cuda::deviceInit() ...@@ -210,40 +207,45 @@ int bioem_cuda::deviceInit()
if (FFTAlgo) GPUAlgo = 2; if (FFTAlgo) GPUAlgo = 2;
gpumap = new bioem_RefMap;
memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
if (FFTAlgo == 0)
{
checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize));
checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
}
checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
gpumap->maps = maps;
gpumap->sum_RefMap = sum;
gpumap->sumsquare_RefMap = sumsquare;
checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cudaStreamCreate(&cudaStream));
cout << "\tSize RefMap\t\t" << sizeof(bioem_RefMap) << "\n";
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
cout << "\tSize Probability\t" << sizeof(bioem_Probability) * RefMap.ntotRefMap << "\n";
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap)); checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
for (int i = 0;i < 2;i++) for (int i = 0;i < 2;i++)
{ {
checkCudaErrors(cudaEventCreate(&cudaEvent[i])); checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map))); checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
} }
pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
if (FFTAlgo) if (FFTAlgo)
{ {
cout << "\tSize RefMapFFT\t\t" << RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t) << "\n"; checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t))); checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.FFTMapSize * sizeof(mycomplex_t)));
cout << "\tSize RefMapFFT Copy\t" << CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t)));
cout << "\tSize RefMapFFT Read\t" << CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t))); checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2)); checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice); checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
} }
if (GPUAlgo == 0 || GPUAlgo == 1) if (GPUAlgo == 0 || GPUAlgo == 1)
{ {
checkCudaErrors(cudaMalloc(&pRefMap_device_Mod, sizeof(bioem_RefMap_Mod)));
bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap); bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice); cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
delete RefMapGPU; delete RefMapGPU;
} }
else
{
cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
}
deviceInitialized = 1; deviceInitialized = 1;
return(0); return(0);
...@@ -254,12 +256,13 @@ int bioem_cuda::deviceExit() ...@@ -254,12 +256,13 @@ int bioem_cuda::deviceExit()
if (deviceInitialized == 0) return(0); if (deviceInitialized == 0) return(0);
cudaStreamDestroy(cudaStream); cudaStreamDestroy(cudaStream);
cudaFree(pRefMap_device);
cudaFree(pProb_device); cudaFree(pProb_device);
cudaFree(sum);
cudaFree(sumsquare);
for (int i = 0;i < 2;i++)