diff --git a/bioem_cuda.cu b/bioem_cuda.cu index b00c0b5b00b3ae559413d0a1880398006b4f7c70..e92d67f5f17d290f21d156fbba61848db7408aaf 100644 --- a/bioem_cuda.cu +++ b/bioem_cuda.cu @@ -241,15 +241,39 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t cout << "Error startMap not implemented for GPU Code\n"; exit(1); } + + float time; + cudaEvent_t start, stop; + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + } if (GPUAsync) { checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1])); } - + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000); + checkCudaErrors(cudaEventRecord(start, 0)); + } if (FFTAlgo) { memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t)); checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0])); + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000); + checkCudaErrors(cudaEventRecord(start, 0)); + } if (GPUAsync) { checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2])); @@ -283,7 +307,14 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t else { checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0])); - + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000); + checkCudaErrors(cudaEventRecord(start, 0) ); + } if (GPUAlgo == 2) //Loop over shifts { const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; @@ -330,10 +361,25 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t exit(1); } } + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000); + checkCudaErrors(cudaEventRecord(start, 0)); + } if (GPUWorkload < 100) { bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef); } + if (DebugOutput >= 4) + { + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000); + } if (GPUAsync) { checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));