Commit a6e19b48 authored by Luka Stanisic

additional information for debugging GPU performance

parent 861d145b
......@@ -238,37 +238,34 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
}
float time;
#ifdef DEBUG_GPU
cudaEvent_t start, stop;
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, 0));
}
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
}
if (DebugOutput >= 4)
{
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
#endif
if (FFTAlgo)
{
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
if (DebugOutput >= 4)
{
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
......@@ -302,14 +299,13 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
if (DebugOutput >= 4)
{
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0) );
}
#endif
if (GPUAlgo == 2) //Loop over shifts
{
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
......@@ -356,25 +352,23 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
exit(1);
}
}
if (DebugOutput >= 4)
{
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
#endif
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
}
if (DebugOutput >= 4)
{
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
}
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment