Commit fb425c69 authored by Luka Stanisic

WATCH OUT! This is additional CUDA timing profiling activated by setting...

WATCH OUT! This adds extra CUDA timing profiling, activated by setting BIOEM_DEBUG_OUTPUT=4. The profiling is quite intrusive: it adds additional synchronizations between the GPUs and OMP, which now work on the map comparison sequentially rather than concurrently. If BIOEM_DEBUG_OUTPUT<4, the profiling code is skipped and performance is back to normal.
parent 38ad152a
......@@ -241,15 +241,39 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
cout << "Error startMap not implemented for GPU Code\n";
exit(1);
}
float time;
cudaEvent_t start, stop;
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, 0));
}
if (GPUAsync)
{
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
}
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
if (FFTAlgo)
{
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
......@@ -283,7 +307,14 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0) );
}
if (GPUAlgo == 2) //Loop over shifts
{
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
......@@ -330,10 +361,25 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
exit(1);
}
}
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
}
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
}
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment