Commit 02123bb1 authored by David Rohr's avatar David Rohr

improve async cuda transfer, improve cuda tracing

parent ba4e992e
......@@ -2,7 +2,7 @@
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
......@@ -27,6 +27,29 @@
#include "model.h"
#include "map.h"
#ifdef BIOEM_USE_NVTX
#include "nvToolsExt.h"
const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
const int num_colors = sizeof(colors)/sizeof(colors[0]);
#define cuda_custom_timeslot(name,cid) { \
int color_id = cid; \
color_id = color_id%num_colors;\
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define cuda_custom_timeslot_end nvtxRangePop();
#else
#define cuda_custom_timeslot(name,cid)
#define cuda_custom_timeslot_end
#endif
#include "bioem_algorithm.h"
using namespace boost;
......@@ -440,6 +463,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********************* and turns projection into Fourier space ************************
// **************************************************************************************
cuda_custom_timeslot("Projection", 0);
myfloat3_t RotatedPointsModel[Model.nPointsModel];
myfloat_t rotmat[3][3];
myfloat_t alpha, gam, beta;
......@@ -516,6 +541,8 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********** Omp Critical is necessary with FFTW*******
myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT);
cuda_custom_timeslot_end;
return(0);
}
......@@ -527,6 +554,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
// *************** and Backtransforming it to real Space ********************************
// **************************************************************************************
cuda_custom_timeslot("Convolution", 1);
mycomplex_t* tmp = localCCT[omp_get_thread_num()];
// **** Multiplying FFTmap with corresponding kernel ****
......@@ -559,6 +588,8 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
sumC = sumC / norm2;
sumsquareC = sumsquareC / norm4;
cuda_custom_timeslot_end;
return(0);
}
......
......@@ -2,7 +2,7 @@
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2014 Pilar Cossio, David Rohr and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
......@@ -176,7 +176,12 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
if (FFTAlgo)
{
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0]));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
}
if (GPUDualStream)
{
checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
......@@ -310,6 +315,11 @@ int bioem_cuda::deviceInit()
checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
}
if (GPUAsync)
{
checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
}
if (FFTAlgo)
{
......@@ -358,6 +368,12 @@ int bioem_cuda::deviceExit()
{
cudaFree(pRefMap_device_Mod);
}
if (GPUAsync)
{
cudaStreamDestroy(cudaStream[2]);
cudaEventDestroy(cudaEvent[2]);
}
delete gpumap;
cudaThreadExit();
......
......@@ -29,8 +29,8 @@ protected:
int deviceInitialized;
cudaStream_t cudaStream[2];
cudaEvent_t cudaEvent[2];
cudaStream_t cudaStream[3];
cudaEvent_t cudaEvent[3];
cudaEvent_t cudaFFTEvent[2];
bioem_RefMap_Mod* pRefMap_device_Mod;
bioem_RefMap* gpumap;
......
......@@ -69,6 +69,7 @@ struct myfloat3_t
#define CUDA_BLOCK_COUNT 1024 * 16
#define CUDA_MAX_SHIFT_REDUCE 1024
#define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
static inline void* mallocchk(size_t size)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment