diff --git a/bioem.cpp b/bioem.cpp index c5eeca600f63beb20ae1aa618e5e7a7637183347..9a1e3dbd0be75190bd2bceb112d31db3dc2cbfc7 100644 --- a/bioem.cpp +++ b/bioem.cpp @@ -57,8 +57,8 @@ int bioem::configure(int ac, char* av[]) std::string infile,modelfile,mapfile; Model.readPDB=false; param.writeAngles=false; - RefMap.dumpMap = false; - RefMap.loadMap = false; + param.dumpMap = false; + param.loadMap = false; /*************************************************************************************/ cout << " ++++++++++++ FROM COMMAND LINE +++++++++++\n\n"; @@ -93,12 +93,12 @@ int bioem::configure(int ac, char* av[]) if((ac < 6)) { std::cout << desc << std::endl; - return 0; + return 1; } if (vm.count("help")) { cout << "Usage: options_description [options]\n"; cout << desc; - return 0; + return 1; } if (vm.count("Inputfile")) @@ -123,13 +123,13 @@ int bioem::configure(int ac, char* av[]) if (vm.count("DumpMaps")) { cout << "Dumping Maps after reading from file.\n"; - RefMap.dumpMap = true; + param.dumpMap = true; } if (vm.count("LoadMapDump")) { cout << "Loading Map dump.\n"; - RefMap.loadMap = true; + param.loadMap = true; } if (vm.count("Particlesfile")) @@ -158,7 +158,7 @@ int bioem::configure(int ac, char* av[]) /********************* Reading Particle Maps Input **********************/ /********* HERE: PROBLEM if maps dont fit on the memory!! ***************/ // copying mapfile to ref map class - RefMap.filemap = mapfile.c_str(); + param.filemap = mapfile.c_str(); RefMap.readRefMaps(param); /****************** Precalculating Necessary Stuff *********************/ @@ -189,7 +189,7 @@ int bioem::precalculate() //Precalculating cross-correlations of maps for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap ; iRefMap++) { - calcross_cor(RefMap.Ref[iRefMap],sum,sumsquare); + calcross_cor(RefMap.getmap(iRefMap),sum,sumsquare); //Storing Crosscorrelations in Map class RefMap.sum_RefMap[iRefMap]=sum; RefMap.sumsquare_RefMap[iRefMap]=sumsquare; @@ -235,7 +235,7 @@ int bioem::run() /*** Declaring Private variables for each thread *****/ mycomplex_t* proj_mapFFT; - bioem_map conv_map; + myfloat_t* conv_map = new myfloat_t[param.param_device.NumberPixels * param.param_device.NumberPixels]; mycomplex_t* conv_mapFFT; myfloat_t sumCONV,sumsquareCONV; @@ -285,6 +285,7 @@ int bioem::run() //deallocating fftw_complex vector myfftw_free(proj_mapFFT); myfftw_free(conv_mapFFT); + delete[] conv_map; deviceFinishRun(); @@ -348,15 +349,11 @@ int bioem::run() param.refCTF =NULL; } - if(RefMap.RefMapsFFT) - { - delete[] RefMap.RefMapsFFT; - RefMap.RefMapsFFT = NULL; - } + RefMap.freePointers(); return(0); } -int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) +int bioem::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) { if (FFTAlgo) { @@ -394,7 +391,7 @@ int bioem::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_m inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv, myfloat_t sumC,myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC) { - const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.RefMapSize]; + const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.FFTMapSize]; for(int i = 0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) { localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1]; @@ -493,7 +490,7 @@ int bioem::createProjection(int iMap,mycomplex_t* mapFFT) return(0); } -int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,bioem_map& Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC) +int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,myfloat_t* Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC) { /**************************************************************************************/ /**** BioEM Create Convoluted Projection Map routine, multiplies in Fourier ********** @@ -508,7 +505,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b /**** Multiplying FFTmap with corresponding kernel ****/ - const mycomplex_t* refCTF = ¶m.refCTF[iConv * param.RefMapSize]; + const mycomplex_t* refCTF = ¶m.refCTF[iConv * param.FFTMapSize]; for(int i=0;i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D;i++) { localmultFFT[i][0] = lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1]; @@ -527,7 +524,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b { for(int j=0; j < param.param_device.NumberPixels ; j++ ) { - Mapconv.points[i][j]=localconvFFT[i*param.param_device.NumberPixels+j]; + Mapconv[i*param.param_device.NumberPixels+j] = localconvFFT[i*param.param_device.NumberPixels+j]; } } @@ -553,7 +550,7 @@ int bioem::createConvolutedProjectionMap(int iMap,int iConv,mycomplex_t* lproj,b return(0); } -int bioem::calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare) +int bioem::calcross_cor(myfloat_t* localmap,myfloat_t& sum,myfloat_t& sumsquare) { /*********************** Routine to calculate Cross correlations***********************/ @@ -564,9 +561,9 @@ int bioem::calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare) for (int j = 0; j < param.param_device.NumberPixels; j++) { // Calculate Sum of pixels - sum += localmap.points[i][j]; + sum += localmap[i*param.param_device.NumberPixels+j]; // Calculate Sum of pixels squared - sumsquare += localmap.points[i][j]*localmap.points[i][j]; + sumsquare += localmap[i*param.param_device.NumberPixels+j] * localmap[i*param.param_device.NumberPixels+j]; } } return(0); diff --git a/bioem_algorithm.h b/bioem_algorithm.h index 5640cec6f902ed5956b722e9ebc9a05e311b5ff8..f62127f6c2102ba478de348ba60a587c21bea7ee 100644 --- a/bioem_algorithm.h +++ b/bioem_algorithm.h @@ -131,16 +131,16 @@ __device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, } template <int GPUAlgo, class RefT> -__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, +__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true) { - /**************************************************************************************/ - /********************** Calculating BioEM Probability ********************************/ - /************************* Loop of center displacement here ***************************/ + + // ********************** Calculating BioEM Probability ******************************** + // ************************* Loop of center displacement here *************************** // Taking into account the center displacement - /*** Inizialzing crosscorrelations of calculated projected convolutions ***/ + // Inizialzing crosscorrelations of calculated projected convolutions #ifdef SSECODE myfloat_t sum, sumsquare, crossproMapConv; __m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2; @@ -149,7 +149,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient myfloat_t sumsquare=0.0; myfloat_t crossproMapConv=0.0; #endif - /****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/ + // Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map myfloat_t logpro; if (GPUAlgo != 2 || threadActive) { @@ -195,7 +195,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient #else for (int j = jStart; j < jEnd; j += 1) { - const myfloat_t pointMap = Mapconv.points[i + cent_x][j + cent_y]; + const myfloat_t pointMap = Mapconv[(i + cent_x) * param.NumberPixels + j + cent_y]; const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j); crossproMapConv += pointMap * pointRefMap; // Crosscorrelation of calculated displaced map @@ -217,7 +217,7 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient crossproMapConv = _mm_cvtss_f32(cross_v); #endif - /********** Calculating elements in BioEM Probability formula ********/ + // Calculating elements in BioEM Probability formula logpro = calc_logpro(param, sum, sumsquare, crossproMapConv, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]); } else @@ -380,14 +380,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient else #endif - /***** Summing & Storing total/Orientation Probabilites for each map ************/ + // Summing & Storing total/Orientation Probabilites for each map { update_prob<-1>(logpro, iRefMap, iOrient, iConv, cent_x, cent_y, pProb); } } template <int GPUAlgo, class RefT> -__device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap) +__device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv, const myfloat_t* Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap) { for (int cent_x = -param.maxDisplaceCenter; cent_x <= param.maxDisplaceCenter; cent_x=cent_x+param.GridSpaceCenter) { diff --git a/bioem_cuda.cu b/bioem_cuda.cu index 4e2439754bd4df795d08885728d68ab61623d3fc..60c34e53104ef41ad5a100b1d42f5c11bc031411 100644 --- a/bioem_cuda.cu +++ b/bioem_cuda.cu @@ -10,8 +10,6 @@ using namespace std; #include "bioem_cuda_internal.h" #include "bioem_algorithm.h" -//__constant__ bioem_map gConvMap; - #define checkCudaErrors(error) \ { \ if ((error) != cudaSuccess) \ @@ -34,21 +32,21 @@ bioem_cuda::~bioem_cuda() deviceExit(); } -__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef) +__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef) { const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; if (iRefMap < maxRef) { - compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y); + compareRefMap<0>(iRefMap,iOrient,iConv, pMap, pProb, param, *RefMap, cent_x, cent_y); } } -__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef) +__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef) { const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; if (iRefMap < maxRef) { - compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap); + compareRefMapShifted<1>(iRefMap,iOrient,iConv, pMap, pProb, param, *RefMap); } } @@ -61,7 +59,7 @@ __global__ void cudaZeroMem(void* ptr, size_t size) for (int i = myid;i < mysize;i += mygrid) myptr[i] = 0; } -__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef) +__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef) { const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX; const int iRefMap = myid >> (nShiftBits << 1); @@ -74,7 +72,7 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef; - compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive); + compareRefMap<2>(iRefMap,iOrient,iConv, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive); } __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset) @@ -89,12 +87,12 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re } } -__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef, const int Offset) +__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset) { const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset; const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels]; if (iRefMap >= maxRef) return; - doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap); + doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, RefMap); } template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);} @@ -110,14 +108,13 @@ static inline int ilog2 (int value) static inline int ilog2(int value) {return 31 - __builtin_clz(value);} #endif -int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) +int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap) { if (startMap) { cout << "Error startMap not implemented for GPU Code\n"; exit(1); } - //cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream); if (GPUAsync) { checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1])); @@ -125,23 +122,23 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c if (FFTAlgo) { - checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream)); + checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream)); for (int i = 0;i < maxRef;i += CUDA_FFTS_AT_ONCE) { const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i); - multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, num, i); + multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i); if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS) { cout << "Error running CUFFT\n"; exit(1); } - cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, num, i); + cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i); } checkCudaErrors(cudaGetLastError()); } else { - checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream)); + checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream)); if (GPUAlgo == 2) //Loop over shifts { @@ -166,7 +163,7 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c size_t nBlocks = CUDA_BLOCK_COUNT; for (size_t i = 0;i < totalBlocks;i += nBlocks) { - compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef); + compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef); } } else if (GPUAlgo == 1) //Split shifts in multiple kernels @@ -210,40 +207,45 @@ int bioem_cuda::deviceInit() if (FFTAlgo) GPUAlgo = 2; + gpumap = new bioem_RefMap; + memcpy(gpumap, &RefMap, sizeof(bioem_RefMap)); + if (FFTAlgo == 0) + { + checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize)); + checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice)); + } + checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap)); + checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap)); + checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice)); + gpumap->maps = maps; + gpumap->sum_RefMap = sum; + gpumap->sumsquare_RefMap = sumsquare; + checkCudaErrors(cudaStreamCreate(&cudaStream)); - cout << "\tSize RefMap\t\t" << sizeof(bioem_RefMap) << "\n"; - checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap))); - cout << "\tSize Probability\t" << sizeof(bioem_Probability) * RefMap.ntotRefMap << "\n"; checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap)); for (int i = 0;i < 2;i++) { checkCudaErrors(cudaEventCreate(&cudaEvent[i])); - checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map))); + checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize)); } - pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device; if (FFTAlgo) { - cout << "\tSize RefMapFFT\t\t" << RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t) << "\n"; - checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t))); - cout << "\tSize RefMapFFT Copy\t" << CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t) << "\n"; - checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t))); - cout << "\tSize RefMapFFT Read\t" << CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t) << "\n"; + checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t))); + checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.FFTMapSize * sizeof(mycomplex_t))); checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t))); - checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2)); - cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice); + checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2)); + checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice)); } if (GPUAlgo == 0 || GPUAlgo == 1) { + checkCudaErrors(cudaMalloc(&pRefMap_device_Mod, sizeof(bioem_RefMap_Mod))); bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap); cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice); delete RefMapGPU; } - else - { - cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice); - } deviceInitialized = 1; return(0); @@ -254,12 +256,13 @@ int bioem_cuda::deviceExit() if (deviceInitialized == 0) return(0); cudaStreamDestroy(cudaStream); - cudaFree(pRefMap_device); cudaFree(pProb_device); + cudaFree(sum); + cudaFree(sumsquare); for (int i = 0;i < 2;i++) { cudaEventDestroy(cudaEvent[i]); - cudaFree(pConvMap_device); + cudaFree(pConvMap_device[i]); } if (FFTAlgo) { @@ -268,6 +271,15 @@ int bioem_cuda::deviceExit() //cudaFree(pFFTtmp); cudaFree(pFFTtmp2); } + else + { + cudaFree(maps); + } + if (GPUAlgo == 0 || GPUAlgo == 1) + { + cudaFree(pRefMap_device_Mod); + } + delete gpumap; cudaThreadExit(); deviceInitialized = 0; diff --git a/include/bioem.h b/include/bioem.h index 18929476a204a9475a31662d403a56f83cfdab82..4295794e29f545ff89449f8403d1433eaf9a9f26 100644 --- a/include/bioem.h +++ b/include/bioem.h @@ -21,12 +21,12 @@ public: int dopreCalCrossCorrelation(int iRefMap, int iRefMapLocal); int run(); int doProjections(int iMap); - int createConvolutedProjectionMap(int iOreint,int iMap, mycomplex_t* lproj,bioem_map& Mapconv,mycomplex_t* localmultFFT,myfloat_t& sumC,myfloat_t& sumsquareC); + int createConvolutedProjectionMap(int iOreint,int iMap, mycomplex_t* lproj, myfloat_t* Mapconv,mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC); - virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); + virtual int compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); int createProjection(int iMap, mycomplex_t* map); - int calcross_cor(bioem_map& localmap,myfloat_t& sum,myfloat_t& sumsquare); + int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare); void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT,mycomplex_t* localCCT,myfloat_t* lCC); bioem_Probability* pProb; diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h index fa34cedc5e56c5ea61331c2c7b56d436a5b9d71a..204df1bf2c666772dca6c59b0730f298b84a510e 100644 --- a/include/bioem_cuda_internal.h +++ b/include/bioem_cuda_internal.h @@ -17,7 +17,7 @@ public: bioem_cuda(); virtual ~bioem_cuda(); - virtual int compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); + virtual int compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); protected: virtual int deviceInit(); @@ -30,9 +30,9 @@ protected: cudaStream_t cudaStream; cudaEvent_t cudaEvent[2]; bioem_RefMap_Mod* pRefMap_device_Mod; - bioem_RefMap* pRefMap_device; + bioem_RefMap* gpumap; bioem_Probability* pProb_device; - bioem_map* pConvMap_device[2]; + myfloat_t* pConvMap_device[2]; mycomplex_t* pRefMapsFFT; mycomplex_t* pConvMapFFT; @@ -40,6 +40,8 @@ protected: myfloat_t* pFFTtmp; cufftHandle plan[2]; + myfloat_t *maps, *sum, *sumsquare; + int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best) int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel. int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel. diff --git a/include/defs.h b/include/defs.h index 20dfcbc243dbd49d970dff1f91466fbb1ab2acd9..91eb353cd0bf4df89b70e9f93cb260ed43727a64 100644 --- a/include/defs.h +++ b/include/defs.h @@ -72,4 +72,26 @@ struct myfloat3_t #define CUDA_MAX_SHIFT_REDUCE 1024 #define CUDA_FFTS_AT_ONCE 1024 +static inline void* mallocchk(size_t size) +{ + void* ptr = malloc(size); + if (ptr == 0) + { + std::cout << "Memory allocation error\n"; + exit(1); + } + return(ptr); +} + +static inline void* reallocchk(void* oldptr, size_t size) +{ + void* ptr = realloc(oldptr, size); + if (ptr == 0) + { + std::cout << "Memory allocation error\n"; + exit(1); + } + return(ptr); +} + #endif diff --git a/include/map.h b/include/map.h index f801d166810c446ab17e83ffb543c58be699abc1..f97a650e84109711967999a236a658ff8e9fde8d 100644 --- a/include/map.h +++ b/include/map.h @@ -7,17 +7,26 @@ class bioem_param; -class bioem_map -{ -public: - myfloat_t points[BIOEM_MAP_SIZE_X][BIOEM_MAP_SIZE_Y]; -}; - class bioem_RefMap { public: bioem_RefMap() { + maps = NULL; + RefMapsFFT = NULL; + sum_RefMap = NULL; + sumsquare_RefMap = NULL; + } + + void freePointers() + { + if (maps) free(maps); + if (sum_RefMap) free(sum_RefMap); + if (sumsquare_RefMap) free(sumsquare_RefMap); + if (RefMapsFFT) delete[] RefMapsFFT; + maps = NULL; + sum_RefMap = NULL; + sumsquare_RefMap = NULL; RefMapsFFT = NULL; } int readRefMaps(bioem_param& param); @@ -25,29 +34,25 @@ public: mycomplex_t* RefMapsFFT; - const char* filemap; int ntotRefMap; - bioem_map Ref[BIOEM_MAX_MAPS]; - myfloat_t sum_RefMap[BIOEM_MAX_MAPS]; - myfloat_t sumsquare_RefMap[BIOEM_MAX_MAPS]; - myfloat_t ForLogProbfromRef[BIOEM_MAX_MAPS]; + int numPixels; + int refMapSize; + myfloat_t* maps; + myfloat_t* sum_RefMap; + myfloat_t* sumsquare_RefMap; - bool dumpMap, loadMap; - - __host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(Ref[map].points[x][y]);} - __host__ __device__ inline const myfloat_t* getp(int map, int x, int y) const {return(&Ref[map].points[x][y]);} + __host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(maps[map * refMapSize + x * numPixels + y]);} + __host__ __device__ inline const myfloat_t* getp(int map, int x, int y) const {return(&maps[map * refMapSize + x * numPixels]);} + __host__ __device__ inline myfloat_t* getmap(int map) {return(&maps[map * refMapSize]);} }; class bioem_RefMap_Mod { public: - - const char* filemap; int ntotRefMap; myfloat_t Ref[BIOEM_MAP_SIZE_X][BIOEM_MAP_SIZE_Y][BIOEM_MAX_MAPS]; myfloat_t sum_RefMap[BIOEM_MAX_MAPS]; myfloat_t sumsquare_RefMap[BIOEM_MAX_MAPS]; - myfloat_t ForLogProbfromRef[BIOEM_MAX_MAPS]; __host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(Ref[x][y][map]);} @@ -58,7 +63,6 @@ public: ntotRefMap = map.ntotRefMap; memcpy(sum_RefMap, map.sum_RefMap, sizeof(sum_RefMap)); memcpy(sumsquare_RefMap, map.sumsquare_RefMap, sizeof(sumsquare_RefMap)); - memcpy(ForLogProbfromRef, map.ForLogProbfromRef, sizeof(ForLogProbfromRef)); #pragma omp parallel for for (int i = 0;i < ntotRefMap;i++) { @@ -83,4 +87,5 @@ public: myfloat_t max_prob; int max_prob_cent_x, max_prob_cent_y, max_prob_orient, max_prob_conv; }; + #endif diff --git a/include/param.h b/include/param.h index 5b85e47e947d32f8905848d25d1495dd1c1400d6..33564ec95ff483bfdcc995b18e608dd107166e93 100644 --- a/include/param.h +++ b/include/param.h @@ -32,7 +32,7 @@ public: bioem_param_device param_device; - int RefMapSize; + int FFTMapSize; mycomplex_t* refCTF; myfloat3_t* CtfParam; @@ -73,6 +73,9 @@ public: int fft_plans_created; myfftw_plan fft_plan_c2c_forward, fft_plan_c2c_backward, fft_plan_r2c_forward, fft_plan_c2r_backward; + const char* filemap; + bool dumpMap, loadMap; + private: void releaseFFTPlans(); }; diff --git a/main.cpp b/main.cpp index cf8b518803b09de0c2ce9e5d8c0cc058cd34e471..edcb7cb1016a498f317d3276f5dc2a896e2c9b1e 100644 --- a/main.cpp +++ b/main.cpp @@ -45,7 +45,7 @@ int main(int argc, char* argv[]) /************ Configuration and Pre-calculating necessary objects *****************/ printf("Configuring\n"); - bio->configure(argc,argv); + if (bio->configure(argc,argv)) return(1); /******************************* Run BioEM routine ******************************/ printf("Running\n"); diff --git a/map.cpp b/map.cpp index 7e1039751327323b1441c4c9aa531dd896f9f286..3ef06d206cb5fa7d39eb4476dda9e157a6234f57 100644 --- a/map.cpp +++ b/map.cpp @@ -13,10 +13,13 @@ using namespace std; int bioem_RefMap::readRefMaps(bioem_param& param) { + numPixels = param.param_device.NumberPixels; + refMapSize = param.param_device.NumberPixels * param.param_device.NumberPixels; /**************************************************************************************/ /***********************Reading reference Particle Maps************************/ /**************************************************************************************/ - if (loadMap) + int allocsize = 0; + if (param.loadMap) { FILE* fp = fopen("maps.dump", "rb"); if (fp == NULL) @@ -25,12 +28,8 @@ int bioem_RefMap::readRefMaps(bioem_param& param) exit(1); } fread(&ntotRefMap, sizeof(ntotRefMap), 1, fp); - if (ntotRefMap > BIOEM_MAX_MAPS) - { - cout << "BIOEM_MAX_MAPS too small, some maps dropped\n"; - ntotRefMap = BIOEM_MAX_MAPS; - } - fread(&Ref[0], sizeof(Ref[0]), ntotRefMap, fp); + maps = (myfloat_t*) mallocchk(ntotRefMap * refMapSize * sizeof(myfloat_t)); + fread(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp); fclose(fp); cout << "Particle Maps read from Map Dump \nTotal Number of particles: " << ntotRefMap ; @@ -41,7 +40,7 @@ int bioem_RefMap::readRefMaps(bioem_param& param) int nummap=-1; int lasti=0; int lastj=0; - ifstream input(filemap); + ifstream input(param.filemap); if (!input.good()) { cout << "Particle Maps Failed to open file" << endl ; @@ -61,6 +60,17 @@ int bioem_RefMap::readRefMaps(bioem_param& param) if (strcmp(token,"PARTICLE")==0) // to count the number of maps { nummap++; + if (allocsize == 0) + { + allocsize = 64; + maps = (myfloat_t*) mallocchk(refMapSize * sizeof(myfloat_t) * allocsize); + + } + else if (nummap + 1 >= allocsize) + { + allocsize *= 2; + maps = (myfloat_t*) reallocchk(maps, refMapSize * sizeof(myfloat_t) * allocsize); + } if (nummap % 128 == 0) { cout << "..." << nummap << "\n"; @@ -94,7 +104,7 @@ int bioem_RefMap::readRefMaps(bioem_param& param) //checking for Map limits if(i>0 && i-1 <BIOEM_MAP_SIZE_X&& j>0 && j-1<BIOEM_MAP_SIZE_Y && nummap < BIOEM_MAX_MAPS) { - Ref[nummap].points[i-1][j-1] = (myfloat_t) z; + maps[nummap * refMapSize + (i-1) * numPixels + (j-1)] = (myfloat_t) z; lasti=i; lastj=j; } @@ -107,10 +117,11 @@ int bioem_RefMap::readRefMaps(bioem_param& param) } cout << "."; ntotRefMap=nummap+1; + maps = (myfloat_t*) reallocchk(maps, refMapSize * sizeof(myfloat_t) * ntotRefMap); cout << "Particle Maps read from Standard File \nTotal Number of particles: " << ntotRefMap ; cout << "\n+++++++++++++++++++++++++++++++++++++++++ \n"; - if (dumpMap) + if (param.dumpMap) { FILE* fp = fopen("maps.dump", "w+b"); if (fp == NULL) @@ -119,11 +130,14 @@ int bioem_RefMap::readRefMaps(bioem_param& param) exit(1); } fwrite(&ntotRefMap, sizeof(ntotRefMap), 1, fp); - fwrite(&Ref[0], sizeof(Ref[0]), ntotRefMap, fp); + fwrite(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp); fclose(fp); } } + sum_RefMap = (myfloat_t*) mallocchk(sizeof(myfloat_t) * ntotRefMap); + sumsquare_RefMap = (myfloat_t*) mallocchk(sizeof(myfloat_t) * ntotRefMap); + return(0); } @@ -137,7 +151,7 @@ int bioem_RefMap::PreCalculateMapsFFT(bioem_param& param) localMap= (myfloat_t *) myfftw_malloc(sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels); - RefMapsFFT = new mycomplex_t[ntotRefMap * param.RefMapSize]; + RefMapsFFT = new mycomplex_t[ntotRefMap * param.FFTMapSize]; mycomplex_t* localout; localout= (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D); @@ -149,7 +163,7 @@ int bioem_RefMap::PreCalculateMapsFFT(bioem_param& param) { for(int j=0; j< param.param_device.NumberPixels; j++) { - localMap[i*param.param_device.NumberPixels+j]=Ref[iRefMap].points[i][j]; + localMap[i * param.param_device.NumberPixels+j] = maps[iRefMap * refMapSize + i * param.param_device.NumberPixels + j]; } } @@ -157,7 +171,7 @@ int bioem_RefMap::PreCalculateMapsFFT(bioem_param& param) myfftw_execute_dft_r2c(param.fft_plan_r2c_forward,localMap,localout); // Normalizing and saving the Reference CTFs - mycomplex_t* RefMap = &RefMapsFFT[iRefMap * param.RefMapSize]; + mycomplex_t* RefMap = &RefMapsFFT[iRefMap * param.FFTMapSize]; for(int i=0; i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D ; i++ ) { RefMap[i][0]=localout[i][0]; diff --git a/param.cpp b/param.cpp index ae189ab9f12efb560e185feb97cb74f0192b469d..64e21787c318fe328d68c16b64ac29cbba15095c 100644 --- a/param.cpp +++ b/param.cpp @@ -180,7 +180,7 @@ int bioem_param::readParameters() } input.close(); param_device.NumberFFTPixels1D = param_device.NumberPixels / 2 + 1; - RefMapSize = param_device.NumberPixels * param_device.NumberFFTPixels1D; + FFTMapSize = param_device.NumberPixels * param_device.NumberFFTPixels1D; cout << " +++++++++++++++++++++++++++++++++++++++++ \n"; cout << "Preparing FFTs\n"; @@ -278,7 +278,7 @@ int bioem_param::CalculateRefCTF() nTotCTFs = numberGridPointsCTF_amp * numberGridPointsCTF_phase * numberGridPointsEnvelop; delete[] refCTF; - refCTF = new mycomplex_t[nTotCTFs * RefMapSize]; + refCTF = new mycomplex_t[nTotCTFs * FFTMapSize]; delete[] CtfParam; CtfParam = new myfloat3_t[nTotCTFs]; @@ -317,7 +317,7 @@ int bioem_param::CalculateRefCTF() myfftw_execute_dft_r2c(fft_plan_r2c_forward,localCTF,localout); // Normalizing and saving the Reference CTFs - mycomplex_t* curRef = &refCTF[n * RefMapSize]; + mycomplex_t* curRef = &refCTF[n * FFTMapSize]; for(int i=0; i < param_device.NumberPixels * param_device.NumberFFTPixels1D; i++ ) { curRef[i][0] = localout[i][0];