Commit 81fab4b2 authored by David Rohr

Split the FFTs for the maps on the GPU into smaller steps to save memory.

parent e914db36
......@@ -77,11 +77,11 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}
// NOTE(review): this span is a unified-diff view — for several lines both the
// pre-change and the post-change variant are present (the removed line appears
// directly above its replacement), and one statement of the loop body is
// hidden behind the "@@ -89,10 ..." hunk marker. Annotated as-is; do not
// compile this span verbatim.
//
// Kernel: multiplies one convolved projection map (convmap, Fourier space)
// with a batch of reference-map FFTs (refmap), writing the products into
// 'out' as input for a batched inverse FFT. One CUDA block handles one
// reference map; the block's threads stride over that map's pixels.
//
// Pre-change signature (removed by this commit):
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps)
// Post-change signature: 'Offset' selects which CUDA_FFTS_AT_ONCE-sized chunk
// of the reference maps this launch processes (see the chunked loop added in
// bioem_cuda::compareRefMaps).
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
{
// Surplus blocks beyond the (possibly short) last chunk exit immediately.
if (myBlockIdxX >= NumberMaps) return;
// Pre-change lines (removed): both pointers indexed from the array start.
const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize];
mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
// Post-change lines: input offset into the full refmap array; output still
// indexed per-launch since pFFTtmp2 only holds the current chunk.
// NOTE(review): possible unit mismatch — the call site passes the chunk's
// starting MAP index ('i') as Offset, but here it is added as a raw ELEMENT
// offset; the consistent form would be (myBlockIdxX + Offset) * MapSize or a
// caller passing i * MapSize. Verify against the full commit.
const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize + Offset];
mycuComplex_t* myout = &out[(myBlockIdxX * MapSize)];
for(int i = myThreadIdxX;i < NumberPixelsTotal;i += myBlockDimX)
{
// Real part: a[0]b[0] + a[1]b[1], i.e. Re(convmap[i] * conj(myin[i])) —
// correlation rather than convolution in Fourier space.
myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
// NOTE(review): the matching assignment of myout[i].y is hidden by the
// hunk marker below and is not visible in this view.
......@@ -89,10 +89,10 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re
}
}
// Computes the probabilities for a batch of reference maps from their
// cross-correlation maps (the output of the batched inverse FFT).
//
// This commit splits the FFTs into chunks of CUDA_FFTS_AT_ONCE maps to save
// GPU memory, so the kernel now receives 'Offset', the index of the first
// reference map of the current chunk. Launch layout: 1D grid, one thread per
// reference map of the chunk.
//
//   lCC    - cross-correlation values for the CURRENT CHUNK ONLY, laid out as
//            NumberPixels * NumberPixels floats per map.
//   maxRef - total number of reference maps handled by the GPU; threads whose
//            global map index reaches it exit (tail guard for the last chunk).
//
// NOTE(review): the diff view showed the pre- and post-change versions of the
// signature and of the first two statements interleaved as duplicate lines;
// this is the resolved post-change version of the kernel.
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef, const int Offset)
{
	// Global reference-map index: position within this launch plus the chunk's
	// starting map index.
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
	// lCC holds only the current chunk, so it is indexed with the local
	// (per-launch) thread index, NOT with the global iRefMap. Pointer
	// arithmetic only here — no dereference happens before the guard below.
	const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
	if (iRefMap >= maxRef) return;
	doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap);
}
......@@ -126,14 +126,17 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
if (FFTAlgo)
{
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
multComplexMap<<<maxRef, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, maxRef);
cudaZeroMem<<<32, 256>>>(pFFTtmp, maxRef * sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
if (mycufftExecC2R(plan, pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
for (int i = 0;i < maxRef;i += CUDA_FFTS_AT_ONCE)
{
cout << "Error running CUFFT\n";
exit(1);
const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, num, i);
if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{
cout << "Error running CUFFT\n";
exit(1);
}
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, num, i);
}
cuDoRefMapsFFT<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, maxRef);
checkCudaErrors(cudaGetLastError());
}
else
......@@ -208,7 +211,9 @@ int bioem_cuda::deviceInit()
if (FFTAlgo) GPUAlgo = 2;
checkCudaErrors(cudaStreamCreate(&cudaStream));
cout << "\tSize RefMap\t\t" << sizeof(bioem_RefMap) << "\n";
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
cout << "\tSize Probability\t" << sizeof(bioem_Probability) * RefMap.ntotRefMap << "\n";
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
for (int i = 0;i < 2;i++)
{
......@@ -219,28 +224,14 @@ int bioem_cuda::deviceInit()
if (FFTAlgo)
{
cout << "\tSize RefMapFFT\t\t" << RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t) << "\n";
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp, RefMap.ntotRefMap * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
cout << "\tSize RefMapFFT Copy\t" << CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t)));
cout << "\tSize RefMapFFT Read\t" << CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2));
cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice);
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, RefMap.ntotRefMap) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan, cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
if (GPUAlgo == 0 || GPUAlgo == 1)
......@@ -274,9 +265,8 @@ int bioem_cuda::deviceExit()
{
cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT);
cudaFree(pFFTtmp);
//cudaFree(pFFTtmp);
cudaFree(pFFTtmp2);
cufftDestroy(plan);
}
cudaThreadExit();
......@@ -289,6 +279,29 @@ int bioem_cuda::deviceStartRun()
maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
if (FFTAlgo)
{
for (int i = 0;i < 2;i++)
{
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan[i], 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan[i], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan[i], cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
}
return(0);
}
......@@ -297,6 +310,11 @@ int bioem_cuda::deviceFinishRun()
if (GPUAsync) cudaStreamSynchronize(cudaStream);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
if (FFTAlgo)
{
for (int i = 0;i < 2;i++) cufftDestroy(plan[i]);
}
return(0);
}
......
......@@ -38,7 +38,7 @@ protected:
mycomplex_t* pConvMapFFT;
mycuComplex_t* pFFTtmp2;
myfloat_t* pFFTtmp;
cufftHandle plan;
cufftHandle plan[2];
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
......
......@@ -40,7 +40,7 @@ typedef myfloat_t mycomplex_t[2];
#define BIOEM_MAP_SIZE_X 224
#define BIOEM_MAP_SIZE_Y 224
#define BIOEM_MODEL_SIZE 120000
#define BIOEM_MAX_MAPS 4000
#define BIOEM_MAX_MAPS 8000
#define MAX_ORIENT 20000
struct myfloat3_t
......@@ -70,5 +70,6 @@ struct myfloat3_t
#define CUDA_THREAD_COUNT 256
#define CUDA_BLOCK_COUNT 1024 * 16
#define CUDA_MAX_SHIFT_REDUCE 1024
#define CUDA_FFTS_AT_ONCE 1024
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment