diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e3cda236a394a5a0b174ace3043a6736b5ba42d..fd731f9a7896cfebf6d49a81eba7bd93c5c78da6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option (USE_OPENMP "Build BioEM with OpenMP support" ON) option (USE_MPI "Build BioEM with MPI support" ON) option (PRINT_CMAKE_VARIABLES "List all CMAKE Variables" OFF) option (CUDA_FORCE_GCC "Force GCC as host compiler for CUDA part (If standard host compiler is incompatible with CUDA)" ON) +option (USE_NVTX "Build BioEM with additional NVTX information" OFF) ###Set up general variables @@ -138,6 +139,16 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-vla -Wno-long-long -Wall -pedantic") endif() +###Enable CUDA debugging with NVTX +if (USE_NVTX) + if (CUDA_FOUND) + set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARY} "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so") + add_definitions(-DBIOEM_USE_NVTX) + else() + message(FATAL_ERROR "Cannot use NVTX if CUDA is not found") + endif() +endif() + ###Add Libraries if (CUDA_FOUND) cuda_add_cufft_to_target(bioEM) diff --git a/bioem.cpp b/bioem.cpp index ddaf52e6147bc2beff195fcbe8c16a90206a8898..93c10e538eaeac2f2c4a7bf0aeb9778c2854d5fc 100644 --- a/bioem.cpp +++ b/bioem.cpp @@ -53,8 +53,12 @@ const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; const int num_colors = sizeof(colors)/sizeof(colors[0]); +enum myColor { COLOR_PROJECTION, COLOR_CONVOLUTION, COLOR_COMPARISON, COLOR_WORKLOAD }; -#define cuda_custom_timeslot(name,cid) { \ +// Projection number is stored in category attribute +// Convolution number is stored in payload attribute + +#define cuda_custom_timeslot(name,iMap,iConv,cid) { \ int color_id = cid; \ color_id = color_id%num_colors; \ nvtxEventAttributes_t eventAttrib = {0}; \ @@ -62,13 +66,16 @@ const int num_colors = sizeof(colors)/sizeof(colors[0]); eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ eventAttrib.colorType = NVTX_COLOR_ARGB; \ eventAttrib.color = colors[color_id]; \ + eventAttrib.category = iMap; \ + eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64; \ + eventAttrib.payload.llValue = iConv; \ eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ eventAttrib.message.ascii = name; \ nvtxRangePushEx(&eventAttrib); \ } #define cuda_custom_timeslot_end nvtxRangePop(); #else -#define cuda_custom_timeslot(name,cid) +#define cuda_custom_timeslot(name,iMap,iConv,cid) #define cuda_custom_timeslot_end #endif @@ -544,7 +551,7 @@ int bioem::run() if (Autotuning) { aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE); - rebalance(aut.Workload()); + rebalanceWrapper(aut.Workload()); } if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size); @@ -579,7 +586,7 @@ int bioem::run() if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart)) { aut.Reset(); - rebalance(aut.Workload()); + rebalanceWrapper(aut.Workload()); } for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++) @@ -628,7 +635,7 @@ int bioem::run() if (compTime == 0.) compTime = timer.GetCurrentElapsedTime(); aut.Tune(compTime); if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank); - rebalance(aut.Workload()); + rebalanceWrapper(aut.Workload()); } } if (DebugOutput >= 1) @@ -1027,6 +1034,7 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, //*************************************************************************************** //***** BioEM routine for comparing reference maps to convoluted maps ***** //*************************************************************************************** + cuda_custom_timeslot("Comparison", iOrient, iConv, COLOR_COMPARISON); if (FFTAlgo) { //With FFT Algorithm @@ -1046,6 +1054,8 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, compareRefMapShifted < -1 > (iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, conv_map, pProb, param.param_device, RefMap); } } + + cuda_custom_timeslot_end; return(0); } @@ -1097,7 +1107,7 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT) // ********************* and turns projection into Fourier space ************************ // ************************************************************************************** - cuda_custom_timeslot("Projection", 0); + cuda_custom_timeslot("Projection", iMap, 0, COLOR_PROJECTION); myfloat3_t RotatedPointsModel[Model.nPointsModel]; myfloat_t rotmat[3][3]; @@ -1298,7 +1308,7 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj // ************************************************************************************** - cuda_custom_timeslot("Convolution", 1); + cuda_custom_timeslot("Convolution", iMap, iConv, COLOR_CONVOLUTION); mycomplex_t* tmp = param.fft_scratch_complex[omp_get_thread_num()]; @@ -1456,4 +1466,11 @@ void bioem::free_device_host(void* ptr) free(ptr); } +void bioem::rebalanceWrapper(int workload) +{ + cuda_custom_timeslot("Rebalance workload", -1, workload, COLOR_WORKLOAD); + rebalance(workload); + cuda_custom_timeslot_end; +} + void bioem::rebalance(int workload) {} diff --git a/include/bioem.h b/include/bioem.h index 98222791db36eb60d1061d63c74793719364cf24..813acec26c2bcaeaa78459402d1f8eec0d8997c1 100644 --- a/include/bioem.h +++ b/include/bioem.h @@ -43,6 +43,7 @@ public: virtual void* malloc_device_host(size_t size); virtual void free_device_host(void* ptr); virtual void rebalance(int workload); //Rebalance GPUWorkload + void rebalanceWrapper(int workload); //Rebalance wrapper int createProjection(int iMap, mycomplex_t* map); int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);