Commit d48dcc78 authored by Pilar Cossio's avatar Pilar Cossio
Browse files

Merge branch 'BioEM-1.0' into 'master'

profiling: improving NVTX profiling CPU+GPU execution

See merge request !5
parents 55af9b02 254d53db
Pipeline #15705 passed with stage
in 39 seconds
......@@ -20,6 +20,7 @@ option (USE_OPENMP "Build BioEM with OpenMP support" ON)
option (USE_MPI "Build BioEM with MPI support" ON)
option (PRINT_CMAKE_VARIABLES "List all CMAKE Variables" OFF)
option (CUDA_FORCE_GCC "Force GCC as host compiler for CUDA part (If standard host compiler is incompatible with CUDA)" ON)
option (USE_NVTX "Build BioEM with additional NVTX information" OFF)
###Set up general variables
......@@ -138,6 +139,16 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-vla -Wno-long-long -Wall -pedantic")
endif()
###Enable CUDA debugging with NVTX
if (USE_NVTX)
if (CUDA_FOUND)
set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARY} "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so")
add_definitions(-DBIOEM_USE_NVTX)
else()
message(FATAL_ERROR "Cannot use NVTX if CUDA is not found")
endif()
endif()
###Add Libraries
if (CUDA_FOUND)
cuda_add_cufft_to_target(bioEM)
......
......@@ -52,8 +52,12 @@
const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
const int num_colors = sizeof(colors)/sizeof(colors[0]);
enum myColor { COLOR_PROJECTION, COLOR_CONVOLUTION, COLOR_COMPARISON, COLOR_WORKLOAD, COLOR_INIT };
#define cuda_custom_timeslot(name,cid) { \
// Projection number is stored in category attribute
// Convolution number is stored in payload attribute
#define cuda_custom_timeslot(name,iMap,iConv,cid) { \
int color_id = cid; \
color_id = color_id%num_colors; \
nvtxEventAttributes_t eventAttrib = {0}; \
......@@ -61,13 +65,16 @@ const int num_colors = sizeof(colors)/sizeof(colors[0]);
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.category = iMap; \
eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64; \
eventAttrib.payload.llValue = iConv; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define cuda_custom_timeslot_end nvtxRangePop();
#else
#define cuda_custom_timeslot(name,cid)
#define cuda_custom_timeslot(name,iMap,iConv,cid)
#define cuda_custom_timeslot_end
#endif
......@@ -95,7 +102,7 @@ ostream& operator<<(ostream& os, const vector<T>& v)
bioem::bioem()
{
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 0 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = false;
}
......@@ -466,7 +473,7 @@ int bioem::run()
// **** If we want to control the number of threads -> omp_set_num_threads(XX); ******
// ****************** Declarying class of Probability Pointer *************************
cuda_custom_timeslot("Initialization", -1, -1, COLOR_INIT);
if (mpi_rank == 0) printf("\tInitializing Probabilities\n");
// Contros for MPI
......@@ -534,7 +541,8 @@ int bioem::run()
proj_mapsFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * ProjMapSize * nProjectionsAtOnce);
conv_mapFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
if (!FFTAlgo) conv_map = (myfloat_t*) myfftw_malloc(sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
cuda_custom_timeslot_end; //Ending initialization
HighResTimer timer, timer2;
......@@ -543,7 +551,7 @@ int bioem::run()
if (Autotuning)
{
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
rebalanceWrapper(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -554,6 +562,9 @@ int bioem::run()
int iOrientEnd = (int) ((long long int) (mpi_rank + 1) * param.nTotGridAngles / mpi_size);
if (iOrientEnd > param.nTotGridAngles) iOrientEnd = param.nTotGridAngles;
/* Vectors for computing statistic on different parts of the code */
TimeStat ts((iOrientEnd - iOrientStart), param.nTotCTFs);
if (DebugOutput >= 1) ts.InitTimeStat(4);
// **************************Loop Over orientations***************************************
......@@ -561,8 +572,11 @@ int bioem::run()
{
// ***************************************************************************************
// ***** Creating Projection for given orientation and transforming to Fourier space *****
if (DebugOutput >= 1) timer2.ResetStart();
if (DebugOutput >= 2) timer.ResetStart();
if (DebugOutput >= 1)
{
timer2.ResetStart();
timer.ResetStart();
}
int iTmpEnd = std::min(iOrientEnd, iOrientAtOnce + nProjectionsAtOnce);
// **************************Parallel orientations for projections at once***************
......@@ -572,13 +586,17 @@ int bioem::run()
{
createProjection(iOrient, &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]);
}
if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
if (DebugOutput >= 1)
{
ts.time = timer.GetCurrentElapsedTime();
ts.Add(TS_PROJECTION);
if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, ts.time, mpi_rank);
}
/* Recalibrate if needed */
if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
{
aut.Reset();
rebalance(aut.Workload());
rebalanceWrapper(aut.Workload());
}
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
......@@ -591,52 +609,69 @@ int bioem::run()
for (int iConv = 0; iConv < param.nTotCTFs; iConv++)
{
// *** Calculating convolutions of projection map and crosscorrelations ***
if (DebugOutput >= 2) timer.ResetStart();
if (DebugOutput >= 1) timer.ResetStart();
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
if (DebugOutput >= 1)
{
ts.time = timer.GetCurrentElapsedTime();
ts.Add(TS_CONVOLUTION);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, ts.time, mpi_rank);
}
if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
if ((DebugOutput >= 1) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
amp=param.CtfParam[iConv].pos[0];
pha=param.CtfParam[iConv].pos[1];
env=param.CtfParam[iConv].pos[2];
amp=param.CtfParam[iConv].pos[0];
pha=param.CtfParam[iConv].pos[1];
env=param.CtfParam[iConv].pos[2];
// ******************Internal loop over Reference images CUDA or OpenMP******************
// *** Comparing each calculated convoluted map with all experimental maps ***
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
ts.time = 0.;
if (DebugOutput >= 1)
{
ts.time = timer.GetCurrentElapsedTime();
ts.Add(TS_COMPARISON);
}
if (DebugOutput >= 2)
{
compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / ts.time;
const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / ts.time;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / ts.time;
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, ts.time, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, ts.time, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning && aut.Needed(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
if (ts.time == 0.) ts.time = timer.GetCurrentElapsedTime();
aut.Tune(ts.time);
if (aut.Finished() && DebugOutput >= 1) printf("\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalanceWrapper(aut.Workload());
}
}
if (DebugOutput >= 1)
{
printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
ts.time = timer2.GetCurrentElapsedTime();
ts.Add(TS_TPROJECTION);
printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, ts.time, mpi_rank);
timer2.ResetStart();
}
}
}
/* Statistical summary on different parts of the code */
if (DebugOutput >= 1)
{
ts.PrintTimeStat(mpi_rank);
ts.EmptyTimeStat();
}
//deallocating fftw_complex vector
myfftw_free(proj_mapsFFT);
myfftw_free(conv_mapFFT);
......@@ -1026,6 +1061,7 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha,
//***************************************************************************************
//***** BioEM routine for comparing reference maps to convoluted maps *****
//***************************************************************************************
cuda_custom_timeslot("Comparison", iOrient, iConv, COLOR_COMPARISON);
if (FFTAlgo)
{
//With FFT Algorithm
......@@ -1045,6 +1081,8 @@ int bioem::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha,
compareRefMapShifted < -1 > (iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, conv_map, pProb, param.param_device, RefMap);
}
}
cuda_custom_timeslot_end;
return(0);
}
......@@ -1096,7 +1134,7 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
// ********************* and turns projection into Fourier space ************************
// **************************************************************************************
cuda_custom_timeslot("Projection", 0);
cuda_custom_timeslot("Projection", iMap, 0, COLOR_PROJECTION);
myfloat3_t RotatedPointsModel[Model.nPointsModel];
myfloat_t rotmat[3][3];
......@@ -1297,7 +1335,7 @@ int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj
// **************************************************************************************
cuda_custom_timeslot("Convolution", 1);
cuda_custom_timeslot("Convolution", iMap, iConv, COLOR_CONVOLUTION);
mycomplex_t* tmp = param.fft_scratch_complex[omp_get_thread_num()];
......@@ -1455,4 +1493,11 @@ void bioem::free_device_host(void* ptr)
free(ptr);
}
void bioem::rebalanceWrapper(int workload)
{
cuda_custom_timeslot("Rebalance workload", -1, workload, COLOR_WORKLOAD);
rebalance(workload);
cuda_custom_timeslot_end;
}
void bioem::rebalance(int workload) {}
......@@ -43,6 +43,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
void rebalanceWrapper(int workload); //Rebalance wrapper
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......
......@@ -11,6 +11,15 @@
#ifndef TIMER_H
#define TIMER_H
#include <stdio.h>
#include <string>
#include <numeric>
#include <vector>
#include <algorithm>
#include <cmath>
using namespace std;
class HighResTimer {
public:
......@@ -30,6 +39,40 @@ private:
double ElapsedTime;
double StartTime;
int running;
};
};
/* Structure for saving a vector of timings */
typedef struct _TimeLog {
vector<double> vec;
double sum;
double stdev;
string name;
}TimeLog;
enum TS_NAMES{TS_TPROJECTION, TS_PROJECTION, TS_CONVOLUTION, TS_COMPARISON};
/* Structure for saving timings of different parts of code and doing basic statistics on them */
class TimeStat {
public:
TimeStat(int Angles, int CTFs) : time(0),tl(NULL) {angles = Angles; ctfs = CTFs;};
~TimeStat() {EmptyTimeStat();};
void InitTimeLog(int log, int size, string s);
void InitTimeStat(int nlogs);
void EmptyTimeStat();
void inline Add(int log) {tl[log].vec.push_back(time);};
void ComputeTimeStat();
void PrintTimeStat(int mpi_rank);
/* Variable for storing times during the execution */
double time;
private:
TimeLog* tl;
int total_logs;
int angles;
int ctfs;
};
#endif
......@@ -352,7 +352,7 @@ void bioem_Probability::init(size_t maps, size_t angles, size_t cc, bioem& bio)
nAngles = angles;
nCC = cc;
ptr = bio.malloc_device_host(get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC));
cout << "Allocation #Maps " << maps << " #Angles " << angles << " #cross.cor " << cc << "\n";
if (bio.DebugOutput >= 1) cout << "Allocation #Maps " << maps << " #Angles " << angles << " #cross.cor " << cc << "\n";
//<< " == " << get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC)<< "\n";
set_pointers();
}
......
......@@ -90,4 +90,59 @@ double HighResTimer::GetFrequency()
#endif
}
double HighResTimer::Frequency = HighResTimer::GetFrequency();
\ No newline at end of file
double HighResTimer::Frequency = HighResTimer::GetFrequency();
void TimeStat::InitTimeLog(int log, int size, string s)
{
tl[log].vec.reserve(size);
tl[log].name = s;
tl[log].sum = 0.;
tl[log].stdev = 0.;
}
void TimeStat::InitTimeStat(int nlogs)
{
total_logs = nlogs;
tl = new TimeLog[total_logs];
InitTimeLog(TS_TPROJECTION, angles, "Total time of projection");
InitTimeLog(TS_PROJECTION, angles, "Projection");
InitTimeLog(TS_CONVOLUTION, angles * ctfs, "Convolution");
InitTimeLog(TS_COMPARISON, angles * ctfs, "Comparison");
}
void TimeStat::EmptyTimeStat()
{
if (tl == NULL) return;
delete [ ] tl;
tl = NULL;
time = 0.;
}
void TimeStat::ComputeTimeStat()
{
double mean, sq_sum;
vector<double> diff;
for (int i = 0; i < total_logs; i++)
{
tl[i].sum = std::accumulate(tl[i].vec.begin(), tl[i].vec.end(), 0.0);
mean = tl[i].sum / tl[i].vec.size();
diff.resize(tl[i].vec.size());
std::transform(tl[i].vec.begin(), tl[i].vec.end(), diff.begin(), std::bind2nd(std::minus<double>(), mean));
sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
tl[i].stdev = std::sqrt(sq_sum / tl[i].vec.size());
}
}
void TimeStat::PrintTimeStat(int mpi_rank)
{
ComputeTimeStat();
for (int i = 0; i < total_logs; i++)
{
printf("SUMMARY -> %s: Total %f sec; Mean %f sec; Std.Dev. %f (rank %d)\n", tl[i].name.c_str(), tl[i].sum, tl[i].sum / tl[i].vec.size(), tl[i].stdev, mpi_rank);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment