Commit 7a018205 authored by David Rohr's avatar David Rohr

Add dual stream mode for FFT-GPU mode to improve parallelism

parent fda0cc5c
......@@ -62,6 +62,7 @@ bioem_cuda::bioem_cuda()
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
}
bioem_cuda::~bioem_cuda()
......@@ -165,24 +166,35 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
if (FFTAlgo)
{
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
for (int i = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE)
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0]));
if (GPUDualStream)
{
checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
checkCudaErrors(cudaStreamWaitEvent(cudaStream[1], cudaFFTEvent[0], 0));
}
for (int i = 0, j = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE, j++)
{
if (!GPUDualStream) j = 0;
const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
cufftResult err = mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp);
multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2[j & 1], param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
cufftResult err = mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1][j & 1] : plan[0][j & 1], pFFTtmp2[j & 1], pFFTtmp[j & 1]);
if (err != CUFFT_SUCCESS)
{
cout << "Error running CUFFT " << cufftGetErrorStrung(err) << "\n";
exit(1);
}
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iOrient, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(iOrient, iConv, pFFTtmp[j & 1], sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
}
checkCudaErrors(cudaGetLastError());
if (GPUDualStream)
{
checkCudaErrors(cudaEventRecord(cudaFFTEvent[1], cudaStream[1]));
checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaFFTEvent[1], 0));
}
}
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream));
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
if (GPUAlgo == 2) //Loop over shifts
{
......@@ -207,7 +219,7 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
size_t nBlocks = CUDA_BLOCK_COUNT;
for (size_t i = 0; i < totalBlocks; i += nBlocks)
{
compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream >>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream[0] >>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
}
}
else if (GPUAlgo == 1) //Split shifts in multiple kernels
......@@ -216,13 +228,13 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
{
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y = cent_y + param.param_device.GridSpaceCenter)
{
compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, cent_x, cent_y, maxRef);
compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, cent_x, cent_y, maxRef);
}
}
}
else if (GPUAlgo == 0) //All shifts in one kernel
{
compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, maxRef);
compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>> (iOrient, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, maxRef);
}
else
{
......@@ -236,11 +248,11 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
}
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
}
else
{
checkCudaErrors(cudaStreamSynchronize(cudaStream));
checkCudaErrors(cudaStreamSynchronize(cudaStream[0]));
}
return(0);
}
......@@ -278,21 +290,24 @@ int bioem_cuda::deviceInit()
gpumap->sum_RefMap = sum;
gpumap->sumsquare_RefMap = sumsquare;
checkCudaErrors(cudaStreamCreate(&cudaStream));
pProb_device = pProb;
checkCudaErrors(cudaMalloc(&pProb_device.ptr, pProb_device.get_size(RefMap.ntotRefMap, param.nTotGridAngles)));
pProb_device.set_pointers();
for (int i = 0; i < 2; i++)
{
checkCudaErrors(cudaStreamCreate(&cudaStream[i]));
checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
}
if (FFTAlgo)
{
checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.FFTMapSize * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp2[0], CUDA_FFTS_AT_ONCE * param.FFTMapSize * 2 * sizeof(mycomplex_t)));
checkCudaErrors(cudaMalloc(&pFFTtmp[0], CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * 2 * sizeof(myfloat_t)));
pFFTtmp2[1] = pFFTtmp2[0] + CUDA_FFTS_AT_ONCE * param.FFTMapSize;
pFFTtmp[1] = pFFTtmp[0] + CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels;
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
}
......@@ -305,21 +320,23 @@ int bioem_cuda::deviceExit()
{
if (deviceInitialized == 0) return(0);
cudaStreamDestroy(cudaStream);
cudaFree(pProb_device.ptr);
cudaFree(sum);
cudaFree(sumsquare);
for (int i = 0; i < 2; i++)
{
cudaStreamDestroy(cudaStream[i]);
cudaEventDestroy(cudaEvent[i]);
cudaEventDestroy(cudaFFTEvent[i]);
cudaFree(pConvMap_device[i]);
}
if (FFTAlgo)
{
cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT);
//cudaFree(pFFTtmp);
cudaFree(pFFTtmp2);
cudaFree(pFFTtmp[0]);
cudaFree(pFFTtmp2[0]);
}
else
{
......@@ -344,24 +361,28 @@ int bioem_cuda::deviceStartRun()
if (FFTAlgo)
{
for (int i = 0; i < 2; i++)
for (int j = 0;j < 2;j++)
{
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan[i], 2, n, NULL, 1, param.FFTMapSize, NULL, 1, 0, MY_CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan[i], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan[i], cudaStream) != CUFFT_SUCCESS)
for (int i = 0; i < 2; i++)
{
cout << "Error setting CUFFT stream\n";
exit(1);
int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
if (cufftPlanMany(&plan[i][j], 2, n, NULL, 1, param.FFTMapSize, NULL, 1, 0, MY_CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT\n";
exit(1);
}
if (cufftSetCompatibilityMode(plan[i][j], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
{
cout << "Error planning CUFFT compatibility\n";
exit(1);
}
if (cufftSetStream(plan[i][j], cudaStream[j]) != CUFFT_SUCCESS)
{
cout << "Error setting CUFFT stream\n";
exit(1);
}
}
if (!GPUDualStream) break;
}
}
return(0);
......@@ -369,12 +390,16 @@ int bioem_cuda::deviceStartRun()
int bioem_cuda::deviceFinishRun()
{
if (GPUAsync) cudaStreamSynchronize(cudaStream);
if (GPUAsync) cudaStreamSynchronize(cudaStream[0]);
cudaMemcpy(pProb.ptr, pProb_device.ptr, pProb.get_size(RefMap.ntotRefMap, param.nTotGridAngles), cudaMemcpyDeviceToHost);
if (FFTAlgo)
{
for (int i = 0; i < 2; i++) cufftDestroy(plan[i]);
for (int j = 0;j < 2;j++)
{
for (int i = 0; i < 2; i++) cufftDestroy(plan[i][j]);
if (!GPUDualStream) break;
}
}
return(0);
......
......@@ -27,8 +27,9 @@ protected:
int deviceInitialized;
cudaStream_t cudaStream;
cudaStream_t cudaStream[2];
cudaEvent_t cudaEvent[2];
cudaEvent_t cudaFFTEvent[2];
bioem_RefMap_Mod* pRefMap_device_Mod;
bioem_RefMap* gpumap;
bioem_Probability pProb_device;
......@@ -36,14 +37,15 @@ protected:
mycomplex_t* pRefMapsFFT;
mycomplex_t* pConvMapFFT;
mycuComplex_t* pFFTtmp2;
myfloat_t* pFFTtmp;
cufftHandle plan[2];
mycuComplex_t* pFFTtmp2[2];
myfloat_t* pFFTtmp[2];
cufftHandle plan[2][2];
myfloat_t *maps, *sum, *sumsquare;
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel.
int GPUDualStream; //Use two streams to improve parallelism
int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel.
int maxRef;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment