Commit adefd46f authored by Luka Stanisic's avatar Luka Stanisic

offloading everything related to Autotuning to another class, making the code cleaner

parent ddd6cbd9
......@@ -37,7 +37,7 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}")
endif()
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp")
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp")
###Find Required Packages
find_package(PkgConfig)
......
#include "autotuner.h"
/* Put the tuner back into its starting state so a new tuning pass can begin.
   The selected algorithm (algo) and the stable-iteration count (stable) are
   not touched here; they are only set by Initialize(). */
void Autotuner::Reset()
{
	/* State used by AlgoBisection: bounds a and c, current best point b, probe x */
	a = 1;
	b = 50;
	c = 100;
	x = 50;
	limit = 1;
	fb = 0.;
	fx = 0.;

	/* State used by AlgoSimple and AlgoRatio: no measurement recorded yet */
	best_time = 0.;
	best_workload = 0;

	/* Algorithm 3 starts from a workload of 50, every other algorithm from 100 */
	workload = (algo == 3) ? 50 : 100;

	/* Tuning is active again */
	stopTuning = false;
}
/* Tell the caller whether the given iteration should be used as a tuning step.
   Returns false once tuning has been stopped.
   - Algorithms 1 and 3: true on every iteration whose index modulo (stable + 1)
     equals stable.
   - Algorithm 2: true only for iteration stable / 2 (integer division) and for
     iteration stable.
   - Any other algorithm value: false. */
bool Autotuner::Needed(int iteration)
{
	if (stopTuning) return false;

	if (algo == 1 || algo == 3)
	{
		return iteration % (stable + 1) == stable;
	}

	if (algo == 2)
	{
		const int halfway = stable / 2;
		return (iteration == halfway) || (iteration == stable);
	}

	/* Should never happen */
	return false;
}
/* Check whether the active algorithm has settled on its final workload.
   When it has, stopTuning is set and true is returned; otherwise the tuner
   state is left unchanged and false is returned. */
bool Autotuner::Finished()
{
	bool done = false;

	if (algo == 1)
	{
		/* The sweep is over once the tested workload drops below 30;
		   switch to the workload that gave the best time. */
		done = (workload < 30);
		if (done) workload = best_workload;
	}
	else if (algo == 2)
	{
		/* A non-zero best_workload means the ratio has been computed */
		done = (best_workload != 0);
	}
	else if (algo == 3)
	{
		/* Both halves of the a..b..c bracket have shrunk to the limit */
		done = (c - b == limit) && (b - a == limit);
	}
	/* Any other algorithm value should never happen: done stays false */

	if (done) stopTuning = true;
	return done;
}
/* Feed one measured comparison time to the selected algorithm, which then
   chooses the next workload value to test.
   compTime: the measured time of the comparison that has just finished. */
void Autotuner::Tune(double compTime)
{
	if (algo == 1)
	{
		AlgoSimple(compTime);
	}
	else if (algo == 2)
	{
		AlgoRatio(compTime);
	}
	else if (algo == 3)
	{
		AlgoBisection(compTime);
	}
	/* Any other algorithm value should never happen and is ignored */
}
/* Linear sweep: remember the workload that produced the shortest time so far,
   then move on to a workload that is 5 lower.
   compTime: time measured with the current workload. */
void Autotuner::AlgoSimple(double compTime)
{
	/* best_time == 0 marks "nothing measured yet" */
	const bool noSampleYet = (best_time == 0.);
	const bool fasterThanBest = (compTime < best_time);

	if (noSampleYet || fasterThanBest)
	{
		best_workload = workload;
		best_time = compTime;
	}

	workload = workload - 5;
}
/* Two-measurement tuning.
   First call (best_time still 0): store the time and switch the workload to 1.
   Later calls: derive the workload from the ratio of the two times,
   100 * compTime / (best_time + compTime), truncated to an integer.
   compTime: time measured with the current workload. */
void Autotuner::AlgoRatio(double compTime)
{
	if (best_time == 0.)
	{
		/* First measurement: keep it and test workload 1 next */
		best_time = compTime;
		workload = 1;
		return;
	}

	/* NOTE(review): if this truncates to 0, Finished() (which tests
	   best_workload != 0) will not report completion - confirm whether
	   that can occur in practice. */
	const double share = compTime / (best_time + compTime);
	best_workload = static_cast<int>(100 * share);
	workload = best_workload;
}
/* Bisection-style search over the workload.
   The state is a bracket a..c, the best point found so far b (with time fb)
   and the most recent probe x (with time fx).
   First call after a reset (fb still 0): record the time for b and probe 75.
   Later calls: shrink the bracket around the better of b and x, then probe
   the middle of the wider remaining half.
   compTime: time measured with the current workload. */
void Autotuner::AlgoBisection(double compTime)
{
	if (fb == 0.)
	{
		/* Baseline measurement for b; the first probe is 75 */
		fb = compTime;
		x = 75;
		workload = x;
		return;
	}

	fx = compTime;

	/* Evaluated before b is possibly replaced by x below */
	const bool probeBelowBest = (x < b);

	if (fx < fb)
	{
		/* The probe wins: the old best becomes a bound on its side,
		   and the probe becomes the new best. */
		if (probeBelowBest)
		{
			c = b;
		}
		else
		{
			a = b;
		}
		b = x;
		fb = fx;
	}
	else if (probeBelowBest)
	{
		/* The probe loses and lies below b: it becomes the lower bound */
		a = x;
	}
	else
	{
		/* The probe loses and lies at or above b: it becomes the upper bound */
		c = x;
	}

	/* Next probe: midpoint of the wider half (rounded up in the lower half) */
	const int upperGap = c - b;
	const int lowerGap = b - a;
	if (upperGap > lowerGap)
	{
		x = b + upperGap / 2;
	}
	else
	{
		x = a + (lowerGap + 1) / 2;
	}
	workload = x;
}
......@@ -14,12 +14,6 @@
#ifdef WITH_MPI
#include <mpi.h>
/* Recalibrate every X projections */
#define RECALIB_FACTOR 200
/* After how many comparison iterations, comparison duration becomes stable */
#define FIRST_STABLE 7
#define STABLE_ITERATION(i) (i % (FIRST_STABLE + 1) == FIRST_STABLE)
#define MPI_CHK(expr) \
if (expr != MPI_SUCCESS) \
{ \
......@@ -47,6 +41,7 @@
#include <fftw3.h>
#include <math.h>
#include "timer.h"
#include "autotuner.h"
#include "param.h"
#include "bioem.h"
......@@ -103,7 +98,13 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = getenv("BIOEM_AUTOTUNING") == NULL ? 0 : atoi(getenv("BIOEM_AUTOTUNING"));
Autotuning = false;
if (getenv("GPU") && atoi(getenv("GPU")))
if (!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))
if (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE))
{
Autotuning = true;
}
}
bioem::~bioem()
......@@ -532,17 +533,12 @@ int bioem::run()
HighResTimer timer, timer2;
/* These variables are used for Autotuning */
double best_time = 0;
int workload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
int best_workload = workload;
bool stopTuning=false;
int a=1, b=50, c=100, x=75, limit=1;
double fb=0., fx=0.;
if (Autotuning == 3)
/* Autotuning */
Autotuner aut;
if (Autotuning)
{
workload=b;
rebalance(b);
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -576,15 +572,12 @@ int bioem::run()
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
{
/* Recalibrate if needed */
if (((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && ((iTmpEnd - iOrient) > RECALIB_FACTOR) && (Autotuning == 3) )
{
a=1, b=50, c=100, x=75, limit=1;
fb=0., fx=0.;
workload=b;
rebalance(b);
stopTuning=false;
if (Autotuning && ((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && ((iTmpEnd - iOrient) > RECALIB_FACTOR))
{
aut.Reset();
rebalance(aut.Workload());
}
mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
// ***************************************************************************************
......@@ -598,8 +591,7 @@ int bioem::run()
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
if (Autotuning && !stopTuning) timer.ResetStart();
if (DebugOutput >= 2) timer.ResetStart();
if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
amp=param.CtfParam[iConv].pos[0];
......@@ -622,90 +614,15 @@ int bioem::run()
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., workload, mpi_rank);
}
if (Autotuning == 1 && !stopTuning && STABLE_ITERATION(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
if (best_time == 0 || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
if (workload < 30)
{
stopTuning=true;
workload=best_workload;
}
rebalance(workload);
}
if (Autotuning == 2 && !stopTuning && (iConv == 3 || iConv == 7))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
if (iConv == 3)
{
best_time = compTime;
workload = 1;
}
else if (iConv == 7)
{
workload = (int) 100 * ( compTime / (best_time+compTime) );
if (DebugOutput >= 2)
{
printf("\t\tComparison on GPU only time: %.6f\n", best_time);
printf("\t\tComparison on CPU only time: %.6f\n", compTime);
printf("\t\tOptimal GPU workload: %d%%\n", workload);
}
stopTuning=true;
}
rebalance(workload);
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning == 3 && !stopTuning && STABLE_ITERATION(iConv))
if (Autotuning && aut.Needed(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
if (((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && (iConv == FIRST_STABLE))
{
fb = compTime;
x = 75;
}
else
{
fx = compTime;
if (fx < fb)
{
if (x < b)
c = b;
else
a = b;
b = x;
fb = fx;
}
else
{
if (x < b)
a = x;
else
c = x;
}
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
}
if ((c - b == limit) && (b - a == limit))
{
stopTuning=true;
if (DebugOutput >= 2)
{
printf("\t\tOptimal GPU workload %d%% (rank %d)\n", workload, mpi_rank);
}
}
workload=x;
rebalance(x);
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
}
}
if (DebugOutput >= 1)
......
......@@ -137,6 +137,7 @@ bioem_cuda::bioem_cuda()
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
if (GPUWorkload == -1) GPUWorkload = 100;
GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
}
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#ifndef AUTOTUNER_H
#define AUTOTUNER_H
/* Autotuner for the workload value that callers report as the GPU workload
   percentage. It proposes workload values to test, receives the measured
   comparison time for each of them through Tune(), and reports through
   Finished() when the selected algorithm has settled on a value.

   Algorithms (selected through Initialize()):
     1. AlgoSimple    - lowers the workload in steps of 5 and keeps the value
                        with the best timing.
     2. AlgoRatio     - takes two timings and computes the workload from
                        their ratio.
     3. AlgoBisection - bisection-style search over the workload. */
class Autotuner {

public:

	/* Every member gets a defined value, so that an object on which
	   Initialize() has not been called yet can still be queried safely:
	   tuning is stopped, no algorithm is selected (algo 0 matches no case in
	   Needed/Finished/Tune), and Workload() reports 100. The remaining values
	   mirror the ones Reset() assigns. Initializers follow declaration order. */
	Autotuner()
		: algo(0), stable(0), stopTuning(true), workload(100),
		  best_time(0.), best_workload(0),
		  a(1), b(50), c(100), x(50), limit(1),
		  fb(0.), fx(0.) {}

	/* Select the algorithm (alg) and the stable-iteration count (st), then
	   set all tuning variables to their initial values via Reset() */
	inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }

	/* Resetting variables to initial values */
	void Reset();

	/* Check if autotuning is needed, depending on which comparison is finished */
	bool Needed(int iteration);

	/* Check if optimal workload value has been computed */
	bool Finished();

	/* Set a new workload value to test, depending on the algorithm */
	void Tune(double compTime);

	/* Return workload value */
	inline int Workload() const {return workload;}

private:

	int algo;         /* Selected algorithm: 1, 2 or 3 (0 = none selected) */
	int stable;       /* Iteration count used by Needed() to pick tuning iterations */
	bool stopTuning;  /* True while no tuning is in progress */
	int workload;     /* Workload value currently proposed */

	/* Variables needed for AlgoSimple and AlgoRatio */
	double best_time;
	int best_workload;

	/* Variables needed for AlgoBisection */
	int a;
	int b;
	int c;
	int x;
	int limit;
	double fb, fx;

	/* Autotuning algorithms */
	void AlgoSimple(double compTime);
	void AlgoRatio(double compTime);
	void AlgoBisection(double compTime);
};
#endif
......@@ -72,7 +72,7 @@ protected:
int FFTAlgo; //Use the FFT Algorithm (Default 1)
int DebugOutput; //Debug Output Level (Default 2)
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
int Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
};
#endif
......@@ -91,6 +91,18 @@ struct myfloat3_t
#define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
/* Autotuning
Autotuning algorithms:
1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing.
2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed.
3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found.
*/
#define AUTOTUNING_ALGORITHM 3
/* Recalibrate every X projections */
#define RECALIB_FACTOR 200
/* After how many comparison iterations, comparison duration becomes stable */
#define FIRST_STABLE 7
static inline void* mallocchk(size_t size)
{
void* ptr = malloc(size);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment