Commit bad8337a authored by Luka Stanisic's avatar Luka Stanisic

profiling: improving NVTX profiling CPU+GPU execution

To enable it use undisclosed CMake option -DUSE_NVTX=ON (by default it is OFF)
parent dc843f20
......@@ -20,6 +20,7 @@ option (USE_OPENMP "Build BioEM with OpenMP support" ON)
option (USE_MPI "Build BioEM with MPI support" ON)
option (PRINT_CMAKE_VARIABLES "List all CMAKE Variables" OFF)
option (CUDA_FORCE_GCC "Force GCC as host compiler for CUDA part (If standard host compiler is incompatible with CUDA)" ON)
option (USE_NVTX "Build BioEM with additional NVTX information" OFF)
###Set up general variables
......@@ -138,6 +139,16 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-vla -Wno-long-long -Wall -pedantic")
endif()
###Enable CUDA debugging with NVTX
if (USE_NVTX)
if (CUDA_FOUND)
set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARY} "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so")
add_definitions(-DBIOEM_USE_NVTX)
else()
message(FATAL_ERROR "Cannot use NVTX if CUDA is not found")
endif()
endif()
###Add Libraries
if (CUDA_FOUND)
cuda_add_cufft_to_target(bioEM)
......
......@@ -53,8 +53,12 @@
const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
const int num_colors = sizeof(colors)/sizeof(colors[0]);
enum myColor { COLOR_PROJECTION, COLOR_CONVOLUTION, COLOR_COMPARISON, COLOR_WORKLOAD };
#define cuda_custom_timeslot(name,cid) { \
// Projection number is stored in category attribute
// Convolution number is stored in payload attribute
#define cuda_custom_timeslot(name,iMap,iConv,cid) { \
int color_id = cid; \
color_id = color_id%num_colors; \
nvtxEventAttributes_t eventAttrib = {0}; \
......@@ -62,13 +66,16 @@ const int num_colors = sizeof(colors)/sizeof(colors[0]);
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.category = iMap; \
eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64; \
eventAttrib.payload.llValue = iConv; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define cuda_custom_timeslot_end nvtxRangePop();
#else
#define cuda_custom_timeslot(name,cid)
#define cuda_custom_timeslot(name,iMap,iConv,cid)
#define cuda_custom_timeslot_end
#endif
......@@ -544,7 +551,7 @@ int bioem::run()
if (Autotuning)
{
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
rebalanceWrapper(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -579,7 +586,7 @@ int bioem::run()
if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
{
aut.Reset();
rebalance(aut.Workload());
rebalanceWrapper(aut.Workload());
}
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
......@@ -628,7 +635,7 @@ int bioem::run()
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
rebalanceWrapper(aut.Workload());
}
}
if (DebugOutput >= 1)
......@@ -1027,6 +1034,7 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha,
//***************************************************************************************
//***** BioEM routine for comparing reference maps to convoluted maps *****
//***************************************************************************************
cuda_custom_timeslot("Comparison", iOrient, iConv, COLOR_COMPARISON);
if (FFTAlgo)
{
//With FFT Algorithm
......@@ -1046,6 +1054,8 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha,
compareRefMapShifted < -1 > (iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, conv_map, pProb, param.param_device, RefMap);
}
}
cuda_custom_timeslot_end;
return(0);
}
......@@ -1097,7 +1107,7 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********************* and turns projection into Fourier space ************************
// **************************************************************************************
cuda_custom_timeslot("Projection", 0);
cuda_custom_timeslot("Projection", iMap, 0, COLOR_PROJECTION);
myfloat3_t RotatedPointsModel[Model.nPointsModel];
myfloat_t rotmat[3][3];
......@@ -1298,7 +1308,7 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
// **************************************************************************************
cuda_custom_timeslot("Convolution", 1);
cuda_custom_timeslot("Convolution", iMap, iConv, COLOR_CONVOLUTION);
mycomplex_t* tmp = param.fft_scratch_complex[omp_get_thread_num()];
......@@ -1456,4 +1466,11 @@ void bioem::free_device_host(void* ptr)
free(ptr);
}
void bioem::rebalanceWrapper(int workload)
{
cuda_custom_timeslot("Rebalance workload", -1, workload, COLOR_WORKLOAD);
rebalance(workload);
cuda_custom_timeslot_end;
}
void bioem::rebalance(int workload) {}
......@@ -43,6 +43,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
void rebalanceWrapper(int workload); //Rebalance wrapper
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment