From 02123bb1a6a6d0d35a4ea024350b714430a4b330 Mon Sep 17 00:00:00 2001 From: David Rohr <drohr@jwdt.org> Date: Wed, 14 May 2014 09:36:37 +0200 Subject: [PATCH] improve async cuda transfer, improve cuda tracing --- bioem.cpp | 33 ++++++++++++++++++++++++++++++++- bioem_cuda.cu | 20 ++++++++++++++++++-- include/bioem_cuda_internal.h | 4 ++-- include/defs.h | 1 + 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/bioem.cpp b/bioem.cpp index 5531290..e92a222 100644 --- a/bioem.cpp +++ b/bioem.cpp @@ -2,7 +2,7 @@ < BioEM software for Bayesian inference of Electron Microscopy images> Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer. Max Planck Institute of Biophysics, Frankfurt, Germany. - + See license statement for terms of distribution. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -27,6 +27,29 @@ #include "model.h" #include "map.h" +#ifdef BIOEM_USE_NVTX +#include "nvToolsExt.h" + +const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; +const int num_colors = sizeof(colors)/sizeof(colors[0]); + +#define cuda_custom_timeslot(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ +} +#define cuda_custom_timeslot_end nvtxRangePop(); +#else +#define cuda_custom_timeslot(name,cid) +#define cuda_custom_timeslot_end +#endif #include "bioem_algorithm.h" using namespace boost; @@ -440,6 +463,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT) // ********************* and turns projection into Fourier space ************************ // ************************************************************************************** + cuda_custom_timeslot("Projection", 0); + myfloat3_t RotatedPointsModel[Model.nPointsModel]; myfloat_t rotmat[3][3]; myfloat_t alpha, gam, beta; @@ -516,6 +541,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT) // ********** Omp Critical is necessary with FFTW******* myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT); + cuda_custom_timeslot_end; + return(0); } @@ -527,6 +554,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj // *************** and Backtransforming it to real Space ******************************** // ************************************************************************************** + cuda_custom_timeslot("Convolution", 1); + mycomplex_t* tmp = localCCT[omp_get_thread_num()]; // **** Multiplying FFTmap with corresponding kernel **** @@ -559,6 +588,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj sumC = sumC / norm2; sumsquareC = sumsquareC / norm4; + cuda_custom_timeslot_end; + return(0); } diff --git a/bioem_cuda.cu b/bioem_cuda.cu index e19a844..01e6d98 100644 --- a/bioem_cuda.cu +++ b/bioem_cuda.cu @@ -2,7 +2,7 @@ < BioEM software for Bayesian inference of Electron Microscopy images> Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer. Max Planck Institute of Biophysics, Frankfurt, Germany. - + See license statement for terms of distribution. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -176,7 +176,12 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map if (FFTAlgo) { memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t)); - checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0])); + checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0])); + if (GPUAsync) + { + checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2])); + checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0)); + } if (GPUDualStream) { checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0])); @@ -310,6 +315,11 @@ int bioem_cuda::deviceInit() checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i])); checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize)); } + if (GPUAsync) + { + checkCudaErrors(cudaStreamCreate(&cudaStream[2])); + checkCudaErrors(cudaEventCreate(&cudaEvent[2])); + } if (FFTAlgo) { @@ -358,6 +368,12 @@ int bioem_cuda::deviceExit() { cudaFree(pRefMap_device_Mod); } + if (GPUAsync) + { + cudaStreamDestroy(cudaStream[2]); + cudaEventDestroy(cudaEvent[2]); + } + delete gpumap; cudaThreadExit(); diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h index d3816f9..e3fec06 100644 --- a/include/bioem_cuda_internal.h +++ b/include/bioem_cuda_internal.h @@ -29,8 +29,8 @@ protected: int deviceInitialized; - cudaStream_t cudaStream[2]; - cudaEvent_t cudaEvent[2]; + cudaStream_t cudaStream[3]; + cudaEvent_t cudaEvent[3]; cudaEvent_t cudaFFTEvent[2]; bioem_RefMap_Mod* pRefMap_device_Mod; bioem_RefMap* gpumap; diff --git a/include/defs.h b/include/defs.h index 83ba1bf..8049a0e 100644 --- a/include/defs.h +++ b/include/defs.h @@ -69,6 +69,7 @@ struct myfloat3_t #define CUDA_BLOCK_COUNT 1024 * 16 #define CUDA_MAX_SHIFT_REDUCE 1024 #define CUDA_FFTS_AT_ONCE 1024 +//#define BIOEM_USE_NVTX static inline void* mallocchk(size_t size) { -- GitLab