Commit a6e19b48 authored by Luka Stanisic

additional information for debugging GPU performance

parent 861d145b
...@@ -238,37 +238,34 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t ...@@ -238,37 +238,34 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
} }
float time; float time;
#ifdef DEBUG_GPU
cudaEvent_t start, stop; cudaEvent_t start, stop;
if (DebugOutput >= 4) checkCudaErrors(cudaEventCreate(&start));
{ checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventCreate(&stop)); #endif
checkCudaErrors(cudaEventRecord(start, 0));
}
if (GPUAsync) if (GPUAsync)
{ {
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1])); checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
} }
if (DebugOutput >= 4) #ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
#endif
if (FFTAlgo)
{ {
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000); printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaEventRecord(start, 0));
} #endif
if (FFTAlgo)
{
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
if (DebugOutput >= 4)
{
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
}
if (GPUAsync) if (GPUAsync)
{ {
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2])); checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
...@@ -302,14 +299,13 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t ...@@ -302,14 +299,13 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
else else
{ {
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0])); checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
if (DebugOutput >= 4) #ifdef DEBUG_GPU
{ checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000); checkCudaErrors(cudaEventRecord(start, 0) );
checkCudaErrors(cudaEventRecord(start, 0) ); #endif
}
if (GPUAlgo == 2) //Loop over shifts if (GPUAlgo == 2) //Loop over shifts
{ {
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
...@@ -356,25 +352,23 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t ...@@ -356,25 +352,23 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
exit(1); exit(1);
} }
} }
if (DebugOutput >= 4) #ifdef DEBUG_GPU
{ checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000); checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventRecord(start, 0)); #endif
}
if (GPUWorkload < 100) if (GPUWorkload < 100)
{ {
bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef); bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
} }
if (DebugOutput >= 4) #ifdef DEBUG_GPU
{ checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000); #endif
}
if (GPUAsync) if (GPUAsync)
{ {
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0])); checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment