diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f904164e0528d4db2f6c1b280d9a62aa562a9b9..6e3cda236a394a5a0b174ace3043a6736b5ba42d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}") endif() -set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp") +set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp") ###Find Required Packages find_package(PkgConfig) diff --git a/autotuner.cpp b/autotuner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..125fc6c651579f8c0259a773a1326d410a716273 --- /dev/null +++ b/autotuner.cpp @@ -0,0 +1,126 @@ +#include "autotuner.h" + +void Autotuner::Reset() +{ + stopTuning = false; + workload = 100; + + best_time = 0.; + best_workload = 0; + + a = 1; + b = 50; + c = 100; + x = 50; + limit = 1; + fb = 0.; + fx = 0.; + + if (algo == 3) workload = 50; +} + +bool Autotuner::Needed(int iteration) +{ + if (stopTuning) return false; + + switch (algo) + { + case 1: + case 3: + return iteration % (stable + 1) == stable; + case 2: return (iteration == (int) stable / 2 ) || (iteration == stable); + default: /* Should never happen */; + } + return false; +} + +bool Autotuner::Finished() +{ + switch (algo) + { + case 1: + if (workload < 30) + { + workload = best_workload; + return stopTuning = true; + } + break; + case 2: + if (best_workload != 0) return stopTuning = true; + break; + case 3: + if ((c - b == limit) && (b - a == limit)) return stopTuning = true; + break; + default: /* Should never happen */; + } + return false; +} + +void Autotuner::Tune(double compTime) +{ + switch (algo) + { + case 1: AlgoSimple(compTime); break; + case 2: AlgoRatio(compTime); break; + case 3: AlgoBisection(compTime); break; + default: /* Should never happen */; + } +} + +void Autotuner::AlgoSimple(double compTime) +{ + if (best_time == 0. || compTime < best_time) + { + best_time = compTime; + best_workload = workload; + } + + workload -= 5; +} + +void Autotuner::AlgoRatio(double compTime) +{ + if (best_time == 0.) + { + best_time = compTime; + workload = 1; + } + else + { + best_workload = (int) 100 * (compTime / (best_time + compTime)); + workload = best_workload; + } +} + +void Autotuner::AlgoBisection(double compTime) +{ + if (fb == 0.) + { + fb = compTime; + x = 75; + workload = x; + return; + } + + fx = compTime; + + if (fx < fb) + { + if (x < b) + c = b; + else + a = b; + b = x; + fb = fx; + } + else + { + if (x < b) + a = x; + else + c = x; + } + + x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2); + workload = x; +} diff --git a/bioem.cpp b/bioem.cpp index b7457e348d5966ae76a84ae85e6f1801e984c848..ddaf52e6147bc2beff195fcbe8c16a90206a8898 100644 --- a/bioem.cpp +++ b/bioem.cpp @@ -41,6 +41,7 @@ #include <fftw3.h> #include <math.h> #include "timer.h" +#include "autotuner.h" #include "param.h" #include "bioem.h" @@ -97,6 +98,7 @@ bioem::bioem() FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO")); DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT")); nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE")); + Autotuning = false; } bioem::~bioem() @@ -363,6 +365,19 @@ int bioem::configure(int ac, char* av[]) printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime()); timer.ResetStart(); } + + // ****************** For autotuning ********************** + if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE))) + { + Autotuning = true; + if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE); + } + else + { + Autotuning = false; + if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n"); + } + // ****************** Initializing pointers ********************* deviceInit(); @@ -524,6 +539,14 @@ int bioem::run() HighResTimer timer, timer2; + /* Autotuning */ + Autotuner aut; + if (Autotuning) + { + aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE); + rebalance(aut.Workload()); + } + if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size); @@ -552,6 +575,13 @@ int bioem::run() } if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank); + /* Recalibrate if needed */ + if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart)) + { + aut.Reset(); + rebalance(aut.Workload()); + } + for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++) { mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]; @@ -567,8 +597,7 @@ int bioem::run() createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV); if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank); - - if (DebugOutput >= 2) timer.ResetStart(); + if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart(); myfloat_t amp,pha,env; amp=param.CtfParam[iConv].pos[0]; @@ -580,9 +609,10 @@ int bioem::run() compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV); + double compTime=0.; if (DebugOutput >= 2) { - const double compTime = timer.GetCurrentElapsedTime(); + compTime = timer.GetCurrentElapsedTime(); const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts * (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime; @@ -590,7 +620,15 @@ int bioem::run() (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime; const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime; - printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank); + if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank); + else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank); + } + if (Autotuning && aut.Needed(iConv)) + { + if (compTime == 0.) compTime = timer.GetCurrentElapsedTime(); + aut.Tune(compTime); + if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank); + rebalance(aut.Workload()); } } if (DebugOutput >= 1) @@ -1417,3 +1455,5 @@ void bioem::free_device_host(void* ptr) { free(ptr); } + +void bioem::rebalance(int workload) {} diff --git a/bioem_cuda.cu b/bioem_cuda.cu index 482433153434d89e2e7dc41c7afeb99ce343486f..71a5c97ee2f30633ad69d64f139fc5cac4064798 100644 --- a/bioem_cuda.cu +++ b/bioem_cuda.cu @@ -137,6 +137,7 @@ bioem_cuda::bioem_cuda() GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO")); GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC")); GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD")); + if (GPUWorkload == -1) GPUWorkload = 100; GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM")); } @@ -699,6 +700,23 @@ void bioem_cuda::free_device_host(void* ptr) cudaFreeHost(ptr); } +void bioem_cuda::rebalance(int workload) +{ + if ((workload < 0) || (workload > 100) || (workload == GPUWorkload)) return; + + deviceFinishRun(); + + if (DebugOutput >= 2) + { + printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank); + } + + GPUWorkload = workload; + maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100; + + deviceStartRun(); +} + bioem* bioem_cuda_create() { int count; diff --git a/include/autotuner.h b/include/autotuner.h new file mode 100644 index 0000000000000000000000000000000000000000..10db9ca8d21810f883d4d3bbf74dd9895a9e1498 --- /dev/null +++ b/include/autotuner.h @@ -0,0 +1,62 @@ +/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + < BioEM software for Bayesian inference of Electron Microscopy images> + Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, + Volker Lindenstruth and Gerhard Hummer. + Max Planck Institute of Biophysics, Frankfurt, Germany. + + See license statement for terms of distribution. + + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +#ifndef AUTOTUNER_H +#define AUTOTUNER_H + +class Autotuner { + +public: + Autotuner() {stopTuning = true;} + + /* Setting variables to initial values */ + inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); } + + /* Resetting variables to initial values */ + void Reset(); + + /* Check if autotuning is needed, depending on which comparison is finished */ + bool Needed(int iteration); + + /* Check if optimal workload value has been computed */ + bool Finished(); + + /* Set a new workload value to test, depending on the algorithm */ + void Tune(double compTime); + + /* Return workload value */ + inline int Workload() {return workload;} + +private: + int algo; + int stable; + + bool stopTuning; + int workload; + + /* Variables needed for AlgoSimple and AlgoRatio */ + double best_time; + int best_workload; + + /* Variables needed for AlgoBisection */ + int a; + int b; + int c; + int x; + int limit; + double fb, fx; + + /* Autotuning algorithms */ + void AlgoSimple(double compTime); + void AlgoRatio(double compTime); + void AlgoBisection(double compTime); +}; + +#endif diff --git a/include/bioem.h b/include/bioem.h index 7c0e0d91a9008981567f240152843f14fcce8e39..98222791db36eb60d1061d63c74793719364cf24 100644 --- a/include/bioem.h +++ b/include/bioem.h @@ -42,6 +42,7 @@ public: virtual void* malloc_device_host(size_t size); virtual void free_device_host(void* ptr); + virtual void rebalance(int workload); //Rebalance GPUWorkload int createProjection(int iMap, mycomplex_t* map); int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare); @@ -71,6 +72,7 @@ protected: int FFTAlgo; //Use the FFT Algorithm (Default 1) int DebugOutput; //Debug Output Level (Default 2) int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1) + bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs }; #endif diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h index 4d44c5e1c3904c4048566dc578e3d38b7f6b6de2..708b40fb3e9a2a10b265965baaa224244159b975 100644 --- a/include/bioem_cuda_internal.h +++ b/include/bioem_cuda_internal.h @@ -33,6 +33,7 @@ public: virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); virtual void* malloc_device_host(size_t size); virtual void free_device_host(void* ptr); + virtual void rebalance(int workload); //Rebalance GPUWorkload protected: virtual int deviceInit(); diff --git a/include/defs.h b/include/defs.h index b9635a935331bdbca489f4983364126acd98cbae..b7338ca8246126becf8cebce6191b0fa7ca86ec6 100644 --- a/include/defs.h +++ b/include/defs.h @@ -91,6 +91,18 @@ struct myfloat3_t #define CUDA_FFTS_AT_ONCE 1024 //#define BIOEM_USE_NVTX +/* Autotuning + Autotuning algorithms: + 1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing. + 2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed. + 3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found. + */ +#define AUTOTUNING_ALGORITHM 3 +/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */ +#define RECALIB_FACTOR 200 +/* After how many comparison iterations, comparison duration becomes stable */ +#define FIRST_STABLE 7 + static inline void* mallocchk(size_t size) { void* ptr = malloc(size);