Commit dc843f20 authored by Luka Stanisic's avatar Luka Stanisic

adding GPUWorkload autotuning

parent 4153c48d
......@@ -37,7 +37,7 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}")
endif()
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp")
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp")
###Find Required Packages
find_package(PkgConfig)
......
#include "autotuner.h"
void Autotuner::Reset()
{
stopTuning = false;
workload = 100;
best_time = 0.;
best_workload = 0;
a = 1;
b = 50;
c = 100;
x = 50;
limit = 1;
fb = 0.;
fx = 0.;
if (algo == 3) workload = 50;
}
bool Autotuner::Needed(int iteration)
{
if (stopTuning) return false;
switch (algo)
{
case 1:
case 3:
return iteration % (stable + 1) == stable;
case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
default: /* Should never happen */;
}
return false;
}
bool Autotuner::Finished()
{
switch (algo)
{
case 1:
if (workload < 30)
{
workload = best_workload;
return stopTuning = true;
}
break;
case 2:
if (best_workload != 0) return stopTuning = true;
break;
case 3:
if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
break;
default: /* Should never happen */;
}
return false;
}
void Autotuner::Tune(double compTime)
{
switch (algo)
{
case 1: AlgoSimple(compTime); break;
case 2: AlgoRatio(compTime); break;
case 3: AlgoBisection(compTime); break;
default: /* Should never happen */;
}
}
void Autotuner::AlgoSimple(double compTime)
{
if (best_time == 0. || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
}
void Autotuner::AlgoRatio(double compTime)
{
if (best_time == 0.)
{
best_time = compTime;
workload = 1;
}
else
{
best_workload = (int) 100 * (compTime / (best_time + compTime));
workload = best_workload;
}
}
void Autotuner::AlgoBisection(double compTime)
{
if (fb == 0.)
{
fb = compTime;
x = 75;
workload = x;
return;
}
fx = compTime;
if (fx < fb)
{
if (x < b)
c = b;
else
a = b;
b = x;
fb = fx;
}
else
{
if (x < b)
a = x;
else
c = x;
}
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
workload = x;
}
......@@ -41,6 +41,7 @@
#include <fftw3.h>
#include <math.h>
#include "timer.h"
#include "autotuner.h"
#include "param.h"
#include "bioem.h"
......@@ -97,6 +98,7 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = false;
}
bioem::~bioem()
......@@ -363,6 +365,19 @@ int bioem::configure(int ac, char* av[])
printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
// ****************** For autotuning **********************
if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
{
Autotuning = true;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
}
else
{
Autotuning = false;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n");
}
// ****************** Initializing pointers *********************
deviceInit();
......@@ -524,6 +539,14 @@ int bioem::run()
HighResTimer timer, timer2;
/* Autotuning */
Autotuner aut;
if (Autotuning)
{
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -552,6 +575,13 @@ int bioem::run()
}
if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
/* Recalibrate if needed */
if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
{
aut.Reset();
rebalance(aut.Workload());
}
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
{
mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
......@@ -567,8 +597,7 @@ int bioem::run()
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
if (DebugOutput >= 2) timer.ResetStart();
if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
amp=param.CtfParam[iConv].pos[0];
......@@ -580,9 +609,10 @@ int bioem::run()
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
if (DebugOutput >= 2)
{
const double compTime = timer.GetCurrentElapsedTime();
compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
......@@ -590,7 +620,15 @@ int bioem::run()
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning && aut.Needed(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
}
}
if (DebugOutput >= 1)
......@@ -1417,3 +1455,5 @@ void bioem::free_device_host(void* ptr)
{
free(ptr);
}
void bioem::rebalance(int workload) {}
......@@ -137,6 +137,7 @@ bioem_cuda::bioem_cuda()
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
if (GPUWorkload == -1) GPUWorkload = 100;
GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
}
......@@ -699,6 +700,23 @@ void bioem_cuda::free_device_host(void* ptr)
cudaFreeHost(ptr);
}
void bioem_cuda::rebalance(int workload)
{
if ((workload < 0) || (workload > 100) || (workload == GPUWorkload)) return;
deviceFinishRun();
if (DebugOutput >= 2)
{
printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank);
}
GPUWorkload = workload;
maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
deviceStartRun();
}
bioem* bioem_cuda_create()
{
int count;
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#ifndef AUTOTUNER_H
#define AUTOTUNER_H
class Autotuner {
public:
Autotuner() {stopTuning = true;}
/* Setting variables to initial values */
inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }
/* Resetting variables to initial values */
void Reset();
/* Check if autotuning is needed, depending on which comparison is finished */
bool Needed(int iteration);
/* Check if optimal workload value has been computed */
bool Finished();
/* Set a new workload value to test, depending on the algorithm */
void Tune(double compTime);
/* Return workload value */
inline int Workload() {return workload;}
private:
int algo;
int stable;
bool stopTuning;
int workload;
/* Variables needed for AlgoSimple and AlgoRatio */
double best_time;
int best_workload;
/* Variables needed for AlgoBisection */
int a;
int b;
int c;
int x;
int limit;
double fb, fx;
/* Autotuning algorithms */
void AlgoSimple(double compTime);
void AlgoRatio(double compTime);
void AlgoBisection(double compTime);
};
#endif
......@@ -42,6 +42,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......@@ -71,6 +72,7 @@ protected:
int FFTAlgo; //Use the FFT Algorithm (Default 1)
int DebugOutput; //Debug Output Level (Default 2)
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
};
#endif
......@@ -33,6 +33,7 @@ public:
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
protected:
virtual int deviceInit();
......
......@@ -91,6 +91,18 @@ struct myfloat3_t
#define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
/* Autotuning
Autotuning algorithms:
1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing.
2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed.
3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found.
*/
#define AUTOTUNING_ALGORITHM 3
/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */
#define RECALIB_FACTOR 200
/* After how many comparison iterations, comparison duration becomes stable */
#define FIRST_STABLE 7
static inline void* mallocchk(size_t size)
{
void* ptr = malloc(size);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment