diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index b00c0b5b00b3ae559413d0a1880398006b4f7c70..e92d67f5f17d290f21d156fbba61848db7408aaf 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -241,15 +241,39 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
 		cout << "Error startMap not implemented for GPU Code\n";
 		exit(1);
 	}
+
+	float time;
+	cudaEvent_t start, stop;
+	if (DebugOutput >= 4)
+	{
+		checkCudaErrors(cudaEventCreate(&start));
+		checkCudaErrors(cudaEventCreate(&stop));
+		checkCudaErrors(cudaEventRecord(start, 0));
+	}
 	if (GPUAsync)
 	{
 		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
 	}
-
+	if (DebugOutput >= 4)
+	{
+		checkCudaErrors(cudaEventRecord(stop, 0));
+		checkCudaErrors(cudaEventSynchronize(stop));
+		checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
+		printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
+		checkCudaErrors(cudaEventRecord(start, 0));
+	}
 	if (FFTAlgo)
 	{
 		memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
 		checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
+		if (DebugOutput >= 4)
+		{
+			checkCudaErrors(cudaEventRecord(stop, 0));
+			checkCudaErrors(cudaEventSynchronize(stop));
+			checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
+			printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
+			checkCudaErrors(cudaEventRecord(start, 0));
+		}
 		if (GPUAsync)
 		{
 			checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
@@ -283,7 +307,14 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
 	else
 	{
 		checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
-
+		if (DebugOutput >= 4)
+		{
+			checkCudaErrors(cudaEventRecord(stop, 0));
+			checkCudaErrors(cudaEventSynchronize(stop));
+			checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
+			printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
+			checkCudaErrors(cudaEventRecord(start, 0) );
+		}
 		if (GPUAlgo == 2) //Loop over shifts
 		{
 			const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
@@ -330,10 +361,25 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
 			exit(1);
 		}
 	}
+	if (DebugOutput >= 4)
+	{
+		checkCudaErrors(cudaEventRecord(stop, 0));
+		checkCudaErrors(cudaEventSynchronize(stop));
+		checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
+		printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
+		checkCudaErrors(cudaEventRecord(start, 0));
+	}
 	if (GPUWorkload < 100)
 	{
 		bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
 	}
+	if (DebugOutput >= 4)
+	{
+		checkCudaErrors(cudaEventRecord(stop, 0));
+		checkCudaErrors(cudaEventSynchronize(stop));
+		checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
+		printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
+	}
 	if (GPUAsync)
 	{
 		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));