diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b139634303db68b244b07e5a4d513c30d47b9e9..8293185713fa5809adf3a5f85d238f2eec82585c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}") endif() -set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp") +set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp") ###Find Required Packages find_package(PkgConfig) diff --git a/autotuner.cpp b/autotuner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..84c942e9c6cf7b08c6d57c21b5ff645bd16472c0 --- /dev/null +++ b/autotuner.cpp @@ -0,0 +1,125 @@ +#include "autotuner.h" + +void Autotuner::Reset() +{ + stopTuning = false; + workload = 100; + + best_time = 0.; + best_workload = 0; + + a = 1; + b = 50; + c = 100; + x = 50; + limit = 1; + fb = 0.; + fx = 0.; + + if (algo == 3) workload = 50; +} + +bool Autotuner::Needed(int iteration) +{ + if (stopTuning) return false; + + switch (algo) + { + case 1: + case 3: + return iteration % (stable + 1) == stable; + case 2: return (iteration == (int) stable / 2 ) || (iteration == stable); + default: /* Should never happen */; + } + return false; +} + +bool Autotuner::Finished() +{ + switch (algo) + { + case 1: + if (workload < 30) + { + workload = best_workload; + return stopTuning = true; + } + break; + case 2: + if (best_workload != 0) return stopTuning = true; + break; + case 3: + if ((c - b == limit) && (b - a == limit)) return stopTuning = true; + break; + default: /* Should never happen */; + } + return false; +} + +void Autotuner::Tune(double compTime) +{ + switch (algo) + { + case 1: AlgoSimple(compTime); break; + case 2: AlgoRatio(compTime); break; + case 3: AlgoBisection(compTime); break; + default: /* Should never happen */; + } +} + +void Autotuner::AlgoSimple(double compTime) +{ + if (best_time == 0. || compTime < best_time) + { + best_time = compTime; + best_workload = workload; + } + + workload -= 5; +} +void Autotuner::AlgoRatio(double compTime) +{ + if (best_time == 0.) + { + best_time = compTime; + workload = 1; + } + else + { + best_workload = (int) 100 * (compTime / (best_time + compTime)); + workload = best_workload; + } +} + +void Autotuner::AlgoBisection(double compTime) +{ + if (fb == 0.) + { + fb = compTime; + x = 75; + workload = x; + return; + } + + fx = compTime; + + if (fx < fb) + { + if (x < b) + c = b; + else + a = b; + b = x; + fb = fx; + } + else + { + if (x < b) + a = x; + else + c = x; + } + + x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2); + workload = x; +} diff --git a/bioem.cpp b/bioem.cpp index 69b445a6a4c13611b71832bfe2de6e6ffb693b48..44414d698562507e48c506fe0322a1cc67e0aca8 100644 --- a/bioem.cpp +++ b/bioem.cpp @@ -14,12 +14,6 @@ #ifdef WITH_MPI #include -/* Recalibrate every X projections */ -#define RECALIB_FACTOR 200 -/* After how many comparison iterations, comparison duration becomes stable */ -#define FIRST_STABLE 7 -#define STABLE_ITERATION(i) (i % (FIRST_STABLE + 1) == FIRST_STABLE) - #define MPI_CHK(expr) \ if (expr != MPI_SUCCESS) \ { \ @@ -47,6 +41,7 @@ #include #include #include "timer.h" +#include "autotuner.h" #include "param.h" #include "bioem.h" @@ -103,7 +98,13 @@ bioem::bioem() FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO")); DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT")); nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE")); - Autotuning = getenv("BIOEM_AUTOTUNING") == NULL ? 0 : atoi(getenv("BIOEM_AUTOTUNING")); + Autotuning = false; + if (getenv("GPU") && atoi(getenv("GPU"))) + if (!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1)) + if (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)) + { + Autotuning = true; + } } bioem::~bioem() @@ -532,17 +533,12 @@ int bioem::run() HighResTimer timer, timer2; - /* This variables are used for Autotuning */ - double best_time = 0; - int workload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD")); - int best_workload = workload; - bool stopTuning=false; - int a=1, b=50, c=100, x=75, limit=1; - double fb=0., fx=0.; - if (Autotuning == 3) + /* Autotuning */ + Autotuner aut; + if (Autotuning) { - workload=b; - rebalance(b); + aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE); + rebalance(aut.Workload()); } if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size); @@ -576,15 +572,12 @@ int bioem::run() for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++) { /* Recalibrate if needed */ - if (((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && ((iTmpEnd - iOrient) > RECALIB_FACTOR) && (Autotuning == 3) ) - { - a=1, b=50, c=100, x=75, limit=1; - fb=0., fx=0.; - workload=b; - rebalance(b); - stopTuning=false; + if (Autotuning && ((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && ((iTmpEnd - iOrient) > RECALIB_FACTOR)) + { + aut.Reset(); + rebalance(aut.Workload()); } - + mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]; // *************************************************************************************** @@ -598,8 +591,7 @@ int bioem::run() createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV); if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank); - if (Autotuning && !stopTuning) timer.ResetStart(); - if (DebugOutput >= 2) timer.ResetStart(); + if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart(); myfloat_t amp,pha,env; amp=param.CtfParam[iConv].pos[0]; @@ -622,90 +614,15 @@ int bioem::run() (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime; const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime; - printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., workload, mpi_rank); - } - if (Autotuning == 1 && !stopTuning && STABLE_ITERATION(iConv)) - { - if (compTime == 0.) compTime = timer.GetCurrentElapsedTime(); - - if (best_time == 0 || compTime < best_time) - { - best_time = compTime; - best_workload = workload; - } - - workload -= 5; - if (workload < 30) - { - stopTuning=true; - workload=best_workload; - } - rebalance(workload); - } - if (Autotuning == 2 && !stopTuning && (iConv == 3 || iConv == 7)) - { - if (compTime == 0.) compTime = timer.GetCurrentElapsedTime(); - - if (iConv == 3) - { - best_time = compTime; - workload = 1; - } - else if (iConv == 7) - { - workload = (int) 100 * ( compTime / (best_time+compTime) ); - if (DebugOutput >= 2) - { - printf("\t\tComparison on GPU only time: %.6f\n", best_time); - printf("\t\tComparison on CPU only time: %.6f\n", compTime); - printf("\t\tOptimal GPU workload: %d%%\n", workload); - } - stopTuning=true; - } - rebalance(workload); + if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank); + else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank); } - if (Autotuning == 3 && !stopTuning && STABLE_ITERATION(iConv)) + if (Autotuning && aut.Needed(iConv)) { if (compTime == 0.) compTime = timer.GetCurrentElapsedTime(); - - if (((iOrient - iOrientStart) % RECALIB_FACTOR == 0) && (iConv == FIRST_STABLE)) - { - fb = compTime; - x = 75; - } - else - { - fx = compTime; - if (fx < fb) - { - if (x < b) - c = b; - else - a = b; - b = x; - fb = fx; - } - else - { - if (x < b) - a = x; - else - c = x; - } - x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2); - } - - if ((c - b == limit) && (b - a == limit)) - { - stopTuning=true; - if (DebugOutput >= 2) - { - printf("\t\tOptimal GPU workload %d%% (rank %d)\n", workload, mpi_rank); - } - } - - workload=x; - rebalance(x); + aut.Tune(compTime); + if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank); + rebalance(aut.Workload()); } } if (DebugOutput >= 1) diff --git a/bioem_cuda.cu b/bioem_cuda.cu index ed41ee0b9a1821f6db03099ddee512d7dad4521d..701fbaabbc6d9d2d97d416d0bc1dac9f694fe149 100644 --- a/bioem_cuda.cu +++ b/bioem_cuda.cu @@ -137,6 +137,7 @@ bioem_cuda::bioem_cuda() GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO")); GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC")); GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD")); + if (GPUWorkload == -1) GPUWorkload = 100; GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM")); } diff --git a/include/autotuner.h b/include/autotuner.h new file mode 100644 index 0000000000000000000000000000000000000000..10db9ca8d21810f883d4d3bbf74dd9895a9e1498 --- /dev/null +++ b/include/autotuner.h @@ -0,0 +1,62 @@ +/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + < BioEM software for Bayesian inference of Electron Microscopy images> + Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, + Volker Lindenstruth and Gerhard Hummer. + Max Planck Institute of Biophysics, Frankfurt, Germany. + + See license statement for terms of distribution. + + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +#ifndef AUTOTUNER_H +#define AUTOTUNER_H + +class Autotuner { + +public: + Autotuner() {stopTuning = true;} + + /* Setting variables to initial values */ + inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); } + + /* Resetting variables to initial values */ + void Reset(); + + /* Check if autotuning is needed, depending on which comparison is finished */ + bool Needed(int iteration); + + /* Check if optimal workload value has been computed */ + bool Finished(); + + /* Set a new workload value to test, depending on the algorithm */ + void Tune(double compTime); + + /* Return workload value */ + inline int Workload() {return workload;} + +private: + int algo; + int stable; + + bool stopTuning; + int workload; + + /* Variables needed for AlgoSimple and AlgoRatio */ + double best_time; + int best_workload; + + /* Variables needed for AlgoBisection */ + int a; + int b; + int c; + int x; + int limit; + double fb, fx; + + /* Autotuning algorithms */ + void AlgoSimple(double compTime); + void AlgoRatio(double compTime); + void AlgoBisection(double compTime); +}; + +#endif diff --git a/include/bioem.h b/include/bioem.h index 5714f52487ad32b302ce2eee0a4853ac67f3ec5c..3d9df2951251d8ec2992185ca2cdaf4b35a9396c 100644 --- a/include/bioem.h +++ b/include/bioem.h @@ -72,7 +72,7 @@ protected: int FFTAlgo; //Use the FFT Algorithm (Default 1) int DebugOutput; //Debug Output Level (Default 2) int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1) - int Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs + bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs }; #endif diff --git a/include/defs.h b/include/defs.h index b9635a935331bdbca489f4983364126acd98cbae..5e4c5fa67d49ffc56f43a9de9af5893241c45f7f 100644 --- a/include/defs.h +++ b/include/defs.h @@ -91,6 +91,18 @@ struct myfloat3_t #define CUDA_FFTS_AT_ONCE 1024 //#define BIOEM_USE_NVTX +/* Autotuning + Autotuning algorithms: + 1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing. + 2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed. + 3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found. + */ +#define AUTOTUNING_ALGORITHM 3 +/* Recalibrate every X projections */ +#define RECALIB_FACTOR 200 +/* After how many comparison iterations, comparison duration becomes stable */ +#define FIRST_STABLE 7 + static inline void* mallocchk(size_t size) { void* ptr = malloc(size);