From 02123bb1a6a6d0d35a4ea024350b714430a4b330 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 14 May 2014 09:36:37 +0200
Subject: [PATCH] improve async cuda transfer, improve cuda tracing

---
 bioem.cpp                     | 33 ++++++++++++++++++++++++++++++++-
 bioem_cuda.cu                 | 20 ++++++++++++++++++--
 include/bioem_cuda_internal.h |  4 ++--
 include/defs.h                |  1 +
 4 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/bioem.cpp b/bioem.cpp
index 5531290..e92a222 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -2,7 +2,7 @@
         < BioEM software for Bayesian inference of Electron Microscopy images>
             Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
             Max Planck Institute of Biophysics, Frankfurt, Germany.
- 
+
                 See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -27,6 +27,29 @@
 #include "model.h"
 #include "map.h"
 
+#ifdef BIOEM_USE_NVTX
+#include "nvToolsExt.h"
+
+const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+const int num_colors = sizeof(colors)/sizeof(colors[0]);
+
+#define cuda_custom_timeslot(name,cid) { \
+	int color_id = cid; \
+	color_id = color_id%num_colors;\
+	nvtxEventAttributes_t eventAttrib = {0}; \
+	eventAttrib.version = NVTX_VERSION; \
+	eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+	eventAttrib.colorType = NVTX_COLOR_ARGB; \
+	eventAttrib.color = colors[color_id]; \
+	eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+	eventAttrib.message.ascii = name; \
+	nvtxRangePushEx(&eventAttrib); \
+}
+#define cuda_custom_timeslot_end nvtxRangePop();
+#else
+#define cuda_custom_timeslot(name,cid)
+#define cuda_custom_timeslot_end
+#endif
 #include "bioem_algorithm.h"
 
 using namespace boost;
@@ -440,6 +463,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
 	// ********************* and turns projection into Fourier space ************************
 	// **************************************************************************************
 
+	cuda_custom_timeslot("Projection", 0);
+
 	myfloat3_t RotatedPointsModel[Model.nPointsModel];
 	myfloat_t rotmat[3][3];
 	myfloat_t alpha, gam, beta;
@@ -516,6 +541,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
 	// ********** Omp Critical is necessary with FFTW*******
 	myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT);
 
+	cuda_custom_timeslot_end;
+
 	return(0);
 }
 
@@ -527,6 +554,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
 	// *************** and Backtransforming it to real Space ********************************
 	// **************************************************************************************
 
+	cuda_custom_timeslot("Convolution", 1);
+
 	mycomplex_t* tmp = localCCT[omp_get_thread_num()];
 
 	// **** Multiplying FFTmap with corresponding kernel ****
@@ -559,6 +588,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
 	sumC = sumC / norm2;
 	sumsquareC = sumsquareC / norm4;
 
+	cuda_custom_timeslot_end;
+
 	return(0);
 }
 
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index e19a844..01e6d98 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -2,7 +2,7 @@
         < BioEM software for Bayesian inference of Electron Microscopy images>
             Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
             Max Planck Institute of Biophysics, Frankfurt, Germany.
- 
+
                 See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -176,7 +176,12 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
 	if (FFTAlgo)
 	{
 		memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
-		checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0]));
+		checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
+		if (GPUAsync)
+		{
+			checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
+			checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
+		}
 		if (GPUDualStream)
 		{
 			checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
@@ -310,6 +315,11 @@ int bioem_cuda::deviceInit()
 		checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
 		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
 	}
+	if (GPUAsync)
+	{
+		checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
+		checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
+	}
 
 	if (FFTAlgo)
 	{
@@ -358,6 +368,12 @@ int bioem_cuda::deviceExit()
 	{
 		cudaFree(pRefMap_device_Mod);
 	}
+	if (GPUAsync)
+	{
+		cudaStreamDestroy(cudaStream[2]);
+		cudaEventDestroy(cudaEvent[2]);
+	}
+
 	delete gpumap;
 	cudaThreadExit();
 
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index d3816f9..e3fec06 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -29,8 +29,8 @@ protected:
 
 	int deviceInitialized;
 
-	cudaStream_t cudaStream[2];
-	cudaEvent_t cudaEvent[2];
+	cudaStream_t cudaStream[3];
+	cudaEvent_t cudaEvent[3];
 	cudaEvent_t cudaFFTEvent[2];
 	bioem_RefMap_Mod* pRefMap_device_Mod;
 	bioem_RefMap* gpumap;
diff --git a/include/defs.h b/include/defs.h
index 83ba1bf..8049a0e 100644
--- a/include/defs.h
+++ b/include/defs.h
@@ -69,6 +69,7 @@ struct myfloat3_t
 #define CUDA_BLOCK_COUNT 1024 * 16
 #define CUDA_MAX_SHIFT_REDUCE 1024
 #define CUDA_FFTS_AT_ONCE 1024
+//#define BIOEM_USE_NVTX
 
 static inline void* mallocchk(size_t size)
 {
-- 
GitLab