Commit 02123bb1 authored by David Rohr's avatar David Rohr

improve async cuda transfer, improve cuda tracing

parent ba4e992e
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer. Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution. See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
...@@ -27,6 +27,29 @@ ...@@ -27,6 +27,29 @@
#include "model.h" #include "model.h"
#include "map.h" #include "map.h"
#ifdef BIOEM_USE_NVTX
#include "nvToolsExt.h"
const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
const int num_colors = sizeof(colors)/sizeof(colors[0]);
#define cuda_custom_timeslot(name,cid) { \
int color_id = cid; \
color_id = color_id%num_colors;\
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define cuda_custom_timeslot_end nvtxRangePop();
#else
#define cuda_custom_timeslot(name,cid)
#define cuda_custom_timeslot_end
#endif
#include "bioem_algorithm.h" #include "bioem_algorithm.h"
using namespace boost; using namespace boost;
...@@ -440,6 +463,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT) ...@@ -440,6 +463,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********************* and turns projection into Fourier space ************************ // ********************* and turns projection into Fourier space ************************
// ************************************************************************************** // **************************************************************************************
cuda_custom_timeslot("Projection", 0);
myfloat3_t RotatedPointsModel[Model.nPointsModel]; myfloat3_t RotatedPointsModel[Model.nPointsModel];
myfloat_t rotmat[3][3]; myfloat_t rotmat[3][3];
myfloat_t alpha, gam, beta; myfloat_t alpha, gam, beta;
...@@ -516,6 +541,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT) ...@@ -516,6 +541,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********** Omp Critical is necessary with FFTW******* // ********** Omp Critical is necessary with FFTW*******
myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT); myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT);
cuda_custom_timeslot_end;
return(0); return(0);
} }
...@@ -527,6 +554,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj ...@@ -527,6 +554,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
// *************** and Backtransforming it to real Space ******************************** // *************** and Backtransforming it to real Space ********************************
// ************************************************************************************** // **************************************************************************************
cuda_custom_timeslot("Convolution", 1);
mycomplex_t* tmp = localCCT[omp_get_thread_num()]; mycomplex_t* tmp = localCCT[omp_get_thread_num()];
// **** Multiplying FFTmap with corresponding kernel **** // **** Multiplying FFTmap with corresponding kernel ****
...@@ -559,6 +588,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj ...@@ -559,6 +588,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
sumC = sumC / norm2; sumC = sumC / norm2;
sumsquareC = sumsquareC / norm4; sumsquareC = sumsquareC / norm4;
cuda_custom_timeslot_end;
return(0); return(0);
} }
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer. Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution. See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
...@@ -176,7 +176,12 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map ...@@ -176,7 +176,12 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
if (FFTAlgo) if (FFTAlgo)
{ {
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t)); memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0])); checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
}
if (GPUDualStream) if (GPUDualStream)
{ {
checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0])); checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
...@@ -310,6 +315,11 @@ int bioem_cuda::deviceInit() ...@@ -310,6 +315,11 @@ int bioem_cuda::deviceInit()
checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i])); checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize)); checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
} }
if (GPUAsync)
{
checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
}
if (FFTAlgo) if (FFTAlgo)
{ {
...@@ -358,6 +368,12 @@ int bioem_cuda::deviceExit() ...@@ -358,6 +368,12 @@ int bioem_cuda::deviceExit()
{ {
cudaFree(pRefMap_device_Mod); cudaFree(pRefMap_device_Mod);
} }
if (GPUAsync)
{
cudaStreamDestroy(cudaStream[2]);
cudaEventDestroy(cudaEvent[2]);
}
delete gpumap; delete gpumap;
cudaThreadExit(); cudaThreadExit();
......
...@@ -29,8 +29,8 @@ protected: ...@@ -29,8 +29,8 @@ protected:
int deviceInitialized; int deviceInitialized;
cudaStream_t cudaStream[2]; cudaStream_t cudaStream[3];
cudaEvent_t cudaEvent[2]; cudaEvent_t cudaEvent[3];
cudaEvent_t cudaFFTEvent[2]; cudaEvent_t cudaFFTEvent[2];
bioem_RefMap_Mod* pRefMap_device_Mod; bioem_RefMap_Mod* pRefMap_device_Mod;
bioem_RefMap* gpumap; bioem_RefMap* gpumap;
......
...@@ -69,6 +69,7 @@ struct myfloat3_t ...@@ -69,6 +69,7 @@ struct myfloat3_t
#define CUDA_BLOCK_COUNT 1024 * 16 #define CUDA_BLOCK_COUNT 1024 * 16
#define CUDA_MAX_SHIFT_REDUCE 1024 #define CUDA_MAX_SHIFT_REDUCE 1024
#define CUDA_FFTS_AT_ONCE 1024 #define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
static inline void* mallocchk(size_t size) static inline void* mallocchk(size_t size)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment