Commit d48dcc78 authored by Pilar Cossio

Merge branch 'BioEM-1.0' into 'master'

profiling: improving NVTX profiling CPU+GPU execution

See merge request !5
parents 55af9b02 254d53db
Pipeline #15705 passed with stage
in 39 seconds
......@@ -20,6 +20,7 @@ option (USE_OPENMP "Build BioEM with OpenMP support" ON)
# Toggle MPI support (ON by default).
option (USE_MPI "Build BioEM with MPI support" ON)
# When ON, list all CMake variables (OFF by default).
option (PRINT_CMAKE_VARIABLES "List all CMAKE Variables" OFF)
# Force GCC as the CUDA host compiler (ON by default).
option (CUDA_FORCE_GCC "Force GCC as host compiler for CUDA part (If standard host compiler is incompatible with CUDA)" ON)
# Add NVTX information to the build (OFF by default).
option (USE_NVTX "Build BioEM with additional NVTX information" OFF)
###Set up general variables
......@@ -138,6 +139,16 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-vla -Wno-long-long -Wall -pedantic")
endif()
###Enable CUDA debugging with NVTX
# NVTX support: link the NVIDIA Tools Extension library and define
# BIOEM_USE_NVTX for the sources. Only possible when CUDA was found.
if (USE_NVTX)
  if (CUDA_FOUND)
    # Search for the library instead of hard-coding lib64/libnvToolsExt.so,
    # so toolkits installed with a different library directory still work.
    find_library(NVTX_LIBRARY
                 NAMES nvToolsExt
                 HINTS "${CUDA_TOOLKIT_ROOT_DIR}"
                 PATH_SUFFIXES lib64 lib lib/x64)
    if (NOT NVTX_LIBRARY)
      # Fail at configure time rather than with an obscure link error.
      message(FATAL_ERROR "USE_NVTX is ON but libnvToolsExt was not found under ${CUDA_TOOLKIT_ROOT_DIR}")
    endif()
    set(CUDA_CUDA_LIBRARY ${CUDA_CUDA_LIBRARY} "${NVTX_LIBRARY}")
    add_definitions(-DBIOEM_USE_NVTX)
  else()
    message(FATAL_ERROR "Cannot use NVTX if CUDA is not found")
  endif()
endif()
###Add Libraries
if (CUDA_FOUND)
cuda_add_cufft_to_target(bioEM)
......
This diff is collapsed.
......@@ -43,6 +43,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
void rebalanceWrapper(int workload); //Rebalance wrapper
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......
......@@ -11,6 +11,15 @@
#ifndef TIMER_H
#define TIMER_H
#include <stdio.h>
#include <string>
#include <numeric>
#include <vector>
#include <algorithm>
#include <cmath>
using namespace std;
class HighResTimer {
public:
......@@ -30,6 +39,40 @@ private:
double ElapsedTime;
double StartTime;
int running;
};
};
/* One named series of timing samples together with the statistics derived from it */
struct _TimeLog {
	std::vector<double> vec; /* recorded samples (TimeStat::Add appends to it) */
	double sum;              /* total of all samples, written by ComputeTimeStat */
	double stdev;            /* standard deviation of the samples, written by ComputeTimeStat */
	std::string name;        /* label printed in the summary line */
};
typedef struct _TimeLog TimeLog;
/* Indices of the timing logs; used as the `log` argument of TimeStat methods */
enum TS_NAMES
{
	TS_TPROJECTION = 0, /* "Total time of projection" */
	TS_PROJECTION = 1,  /* "Projection" */
	TS_CONVOLUTION = 2, /* "Convolution" */
	TS_COMPARISON = 3   /* "Comparison" */
};
/* Collects timings of different parts of the code and computes basic
 * statistics (sum, mean, standard deviation) on them.
 *
 * Usage as visible from the implementation: construct, call InitTimeStat()
 * to allocate the logs, store a measurement in `time` and call Add(log) to
 * record it, then call PrintTimeStat() for the summary.
 *
 * NOTE(review): the object owns `tl` through a raw pointer that the
 * destructor frees, but no copy constructor / assignment operator is
 * declared; copying an instance would free the array twice. */
class TimeStat {
public:
	/* Angles and CTFs are only used to pre-size the logs in InitTimeStat().
	 * All members are initialised here (in declaration order), including
	 * total_logs, so the object is in a defined "no logs" state until
	 * InitTimeStat() is called. */
	TimeStat(int Angles, int CTFs) : time(0), tl(NULL), total_logs(0), angles(Angles), ctfs(CTFs) {}
	~TimeStat() {EmptyTimeStat();}
	/* Prepare a single log: reserve `size` samples, set its name to `s` */
	void InitTimeLog(int log, int size, std::string s);
	/* Allocate `nlogs` logs and set up the ones named in TS_NAMES */
	void InitTimeStat(int nlogs);
	/* Free all logs and reset `time` */
	void EmptyTimeStat();
	/* Append the current value of `time` to log `log`.
	 * Precondition: InitTimeStat() has been called and `log` is a valid index;
	 * no check is done here. */
	inline void Add(int log) {tl[log].vec.push_back(time);}
	/* Fill sum and stdev of every log from its recorded samples */
	void ComputeTimeStat();
	/* Compute the statistics and print one summary line per log */
	void PrintTimeStat(int mpi_rank);
	/* Variable for storing times during the execution */
	double time;
private:
	TimeLog* tl;    /* array of total_logs logs, NULL while nothing is allocated */
	int total_logs; /* number of entries in tl */
	int angles;
	int ctfs;
};
#endif
......@@ -352,7 +352,7 @@ void bioem_Probability::init(size_t maps, size_t angles, size_t cc, bioem& bio)
nAngles = angles;
nCC = cc;
ptr = bio.malloc_device_host(get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC));
cout << "Allocation #Maps " << maps << " #Angles " << angles << " #cross.cor " << cc << "\n";
if (bio.DebugOutput >= 1) cout << "Allocation #Maps " << maps << " #Angles " << angles << " #cross.cor " << cc << "\n";
//<< " == " << get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC)<< "\n";
set_pointers();
}
......
......@@ -90,4 +90,59 @@ double HighResTimer::GetFrequency()
#endif
}
double HighResTimer::Frequency = HighResTimer::GetFrequency();
\ No newline at end of file
double HighResTimer::Frequency = HighResTimer::GetFrequency();
void TimeStat::InitTimeLog(int log, int size, string s)
{
tl[log].vec.reserve(size);
tl[log].name = s;
tl[log].sum = 0.;
tl[log].stdev = 0.;
}
/* Allocate the timing logs and set up the four fixed ones (see TS_NAMES).
 *
 * nlogs: requested number of logs. At least TS_COMPARISON + 1 slots are
 * always allocated, because the four fixed logs below are initialised
 * unconditionally; a smaller request would otherwise write past the array.
 *
 * Calling this again releases the previous logs first instead of leaking them. */
void TimeStat::InitTimeStat(int nlogs)
{
	/* delete[] on NULL is a no-op, so this is safe on the first call */
	delete [] tl;
	tl = NULL;
	total_logs = 0;

	const int required = TS_COMPARISON + 1;
	const int count = (nlogs < required) ? required : nlogs;
	tl = new TimeLog[count];
	total_logs = count;

	/* Give every slot defined statistics; slots beyond the fixed ones would
	 * otherwise keep uninitialised sum/stdev values. */
	for (int i = 0; i < total_logs; i++)
	{
		InitTimeLog(i, 0, "");
	}

	InitTimeLog(TS_TPROJECTION, angles, "Total time of projection");
	InitTimeLog(TS_PROJECTION, angles, "Projection");
	InitTimeLog(TS_CONVOLUTION, angles * ctfs, "Convolution");
	InitTimeLog(TS_COMPARISON, angles * ctfs, "Comparison");
}
void TimeStat::EmptyTimeStat()
{
if (tl == NULL) return;
delete [ ] tl;
tl = NULL;
time = 0.;
}
/* Compute, for every log, the sum of its samples and their (population)
 * standard deviation, i.e. sqrt(sum((x - mean)^2) / N).
 *
 * A log without samples gets sum = 0 and stdev = 0 instead of the NaN that
 * a division by zero would produce. Nothing is done when no logs are
 * allocated. */
void TimeStat::ComputeTimeStat()
{
	if (tl == NULL) return;

	for (int i = 0; i < total_logs; i++)
	{
		const std::vector<double>& samples = tl[i].vec;
		tl[i].sum = std::accumulate(samples.begin(), samples.end(), 0.0);

		if (samples.empty())
		{
			tl[i].stdev = 0.;
			continue;
		}

		const double mean = tl[i].sum / samples.size();
		/* Accumulate the squared deviations directly, in sample order;
		 * this needs neither a temporary vector nor std::bind2nd
		 * (removed from the standard library in C++17). */
		double sq_sum = 0.0;
		for (size_t j = 0; j < samples.size(); j++)
		{
			const double diff = samples[j] - mean;
			sq_sum += diff * diff;
		}
		tl[i].stdev = std::sqrt(sq_sum / samples.size());
	}
}
/* Recompute the statistics and print one "SUMMARY" line per log to stdout.
 *
 * mpi_rank: rank number appended to every line.
 *
 * Prints nothing when no logs are allocated. For a log without samples the
 * mean is reported as 0 rather than the result of 0/0. */
void TimeStat::PrintTimeStat(int mpi_rank)
{
	if (tl == NULL) return;

	ComputeTimeStat();
	for (int i = 0; i < total_logs; i++)
	{
		const size_t count = tl[i].vec.size();
		const double mean = (count > 0) ? tl[i].sum / count : 0.;
		printf("SUMMARY -> %s: Total %f sec; Mean %f sec; Std.Dev. %f (rank %d)\n", tl[i].name.c_str(), tl[i].sum, mean, tl[i].stdev, mpi_rank);
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment