Commit 81fab4b2 authored by David Rohr's avatar David Rohr
Browse files

Split the FFTs for the maps on the GPU into smaller batches to save memory

parent e914db36
...@@ -77,11 +77,11 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon ...@@ -77,11 +77,11 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive); compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
} }
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps) __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
{ {
if (myBlockIdxX >= NumberMaps) return; if (myBlockIdxX >= NumberMaps) return;
const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize]; const mycomplex_t* myin = &refmap[myBlockIdxX * MapSize + Offset];
mycuComplex_t* myout = &out[myBlockIdxX * MapSize]; mycuComplex_t* myout = &out[(myBlockIdxX * MapSize)];
for(int i = myThreadIdxX;i < NumberPixelsTotal;i += myBlockDimX) for(int i = myThreadIdxX;i < NumberPixelsTotal;i += myBlockDimX)
{ {
myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1]; myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
...@@ -89,10 +89,10 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re ...@@ -89,10 +89,10 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re
} }
} }
// Kernel: process one batch of reference maps after the inverse FFT.
// One thread per batch-local reference map. lCC holds the batch's back-
// transformed cross-correlation tiles, one NumberPixels*NumberPixels tile
// per batch-local map; Offset maps the batch-local index to the global
// reference-map index used for the probability update.
//
// NOTE(review): the caller launches divup(num, CUDA_THREAD_COUNT) blocks and
// passes the batch size `num` as maxRef together with the batch start `i` as
// Offset. The guard therefore has to test the batch-local index; testing the
// Offset-adjusted global index against the batch size (as before) made every
// batch after the first return immediately without doing any work.
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int maxRef, const int Offset)
{
	// Batch-local index selects the lCC tile of this thread's map.
	const int localRef = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (localRef >= maxRef) return;	// guard the grid tail of this batch
	// Global reference-map index within the full map set.
	const int iRefMap = localRef + Offset;
	const myfloat_t* mylCC = &lCC[localRef * param.NumberPixels * param.NumberPixels];
	doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, *RefMap);
}
...@@ -126,14 +126,17 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c ...@@ -126,14 +126,17 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
if (FFTAlgo) if (FFTAlgo)
{ {
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream)); checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.RefMapSize], localmultFFT, param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
multComplexMap<<<maxRef, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, maxRef); for (int i = 0;i < maxRef;i += CUDA_FFTS_AT_ONCE)
cudaZeroMem<<<32, 256>>>(pFFTtmp, maxRef * sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
if (mycufftExecC2R(plan, pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{ {
cout << "Error running CUFFT\n"; const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
exit(1); multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.RefMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.RefMapSize, num, i);
if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
{
cout << "Error running CUFFT\n";
exit(1);
}
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, num, i);
} }
cuDoRefMapsFFT<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, pRefMap_device, maxRef);
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
} }
else else
...@@ -208,7 +211,9 @@ int bioem_cuda::deviceInit() ...@@ -208,7 +211,9 @@ int bioem_cuda::deviceInit()
if (FFTAlgo) GPUAlgo = 2; if (FFTAlgo) GPUAlgo = 2;
checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cudaStreamCreate(&cudaStream));
cout << "\tSize RefMap\t\t" << sizeof(bioem_RefMap) << "\n";
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap))); checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
cout << "\tSize Probability\t" << sizeof(bioem_Probability) * RefMap.ntotRefMap << "\n";
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap)); checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
for (int i = 0;i < 2;i++) for (int i = 0;i < 2;i++)
{ {
...@@ -219,28 +224,14 @@ int bioem_cuda::deviceInit() ...@@ -219,28 +224,14 @@ int bioem_cuda::deviceInit()
if (FFTAlgo) if (FFTAlgo)
{ {
cout << "\tSize RefMapFFT\t\t" << RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t) << "\n";
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t))); checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t))); cout << "\tSize RefMapFFT Copy\t" << CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp, RefMap.ntotRefMap * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t))); checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.RefMapSize * sizeof(mycomplex_t)));
cout << "\tSize RefMapFFT Read\t" << CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t) << "\n";
checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2)); checkCudaErrors(cudaMalloc(&pConvMapFFT, param.RefMapSize * sizeof(mycomplex_t) * 2));
cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice); cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.RefMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice);
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, RefMap.ntotRefMap) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan, cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
} }
if (GPUAlgo == 0 || GPUAlgo == 1) if (GPUAlgo == 0 || GPUAlgo == 1)
...@@ -274,9 +265,8 @@ int bioem_cuda::deviceExit() ...@@ -274,9 +265,8 @@ int bioem_cuda::deviceExit()
{ {
cudaFree(pRefMapsFFT); cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT); cudaFree(pConvMapFFT);
cudaFree(pFFTtmp); //cudaFree(pFFTtmp);
cudaFree(pFFTtmp2); cudaFree(pFFTtmp2);
cufftDestroy(plan);
} }
cudaThreadExit(); cudaThreadExit();
...@@ -289,6 +279,29 @@ int bioem_cuda::deviceStartRun() ...@@ -289,6 +279,29 @@ int bioem_cuda::deviceStartRun()
maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100); maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice); cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
if (FFTAlgo)
{
for (int i = 0;i < 2;i++)
{
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan[i], 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan[i], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan[i], cudaStream) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
}
return(0); return(0);
} }
...@@ -297,6 +310,11 @@ int bioem_cuda::deviceFinishRun() ...@@ -297,6 +310,11 @@ int bioem_cuda::deviceFinishRun()
if (GPUAsync) cudaStreamSynchronize(cudaStream); if (GPUAsync) cudaStreamSynchronize(cudaStream);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost); cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
if (FFTAlgo)
{
for (int i = 0;i < 2;i++) cufftDestroy(plan[i]);
}
return(0); return(0);
} }
......
...@@ -38,7 +38,7 @@ protected: ...@@ -38,7 +38,7 @@ protected:
mycomplex_t* pConvMapFFT; mycomplex_t* pConvMapFFT;
mycuComplex_t* pFFTtmp2; mycuComplex_t* pFFTtmp2;
myfloat_t* pFFTtmp; myfloat_t* pFFTtmp;
cufftHandle plan; cufftHandle plan[2];
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best) int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel. int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
......
...@@ -40,7 +40,7 @@ typedef myfloat_t mycomplex_t[2]; ...@@ -40,7 +40,7 @@ typedef myfloat_t mycomplex_t[2];
// Compile-time size limits for maps, models, and orientations.
#define BIOEM_MAP_SIZE_X 224	// map width in pixels
#define BIOEM_MAP_SIZE_Y 224	// map height in pixels
#define BIOEM_MODEL_SIZE 120000	// maximum number of model points
#define BIOEM_MAX_MAPS 8000	// maximum number of reference maps
#define MAX_ORIENT 20000	// maximum number of orientations
struct myfloat3_t struct myfloat3_t
...@@ -70,5 +70,6 @@ struct myfloat3_t ...@@ -70,5 +70,6 @@ struct myfloat3_t
// CUDA launch-geometry and batching parameters.
#define CUDA_THREAD_COUNT 256	// threads per block
// Parenthesized: an expression macro must be safe inside surrounding
// arithmetic (e.g. x / CUDA_BLOCK_COUNT); bare `1024 * 16` is not.
#define CUDA_BLOCK_COUNT (1024 * 16)
#define CUDA_MAX_SHIFT_REDUCE 1024
// Number of map FFTs executed per batch; smaller batches reduce the size of
// the temporary FFT buffers allocated on the GPU.
#define CUDA_FFTS_AT_ONCE 1024
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment