Commit efa38150 authored by Pilar Cossio's avatar Pilar Cossio

Merge branch 'BioEM-1.0' into 'BioEM-1.0'

minor fixes

See merge request !2
parents 665e1ec1 dc843f20
Pipeline #14456 passed with stage
in 40 seconds
......@@ -37,7 +37,7 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}")
endif()
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp")
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp")
###Find Required Packages
find_package(PkgConfig)
......@@ -67,6 +67,7 @@ if (CUDA_FOUND)
if (CUDA_FORCE_GCC)
cmake_minimum_required(VERSION 2.8.10.1)
#Use GCC as host compiler for CUDA even though host compiler for other files is not GCC
#set (CUDA_HOST_COMPILER /mpcdf/soft/SLES122/common/gcc/5.4.0/bin/gcc)
set (CUDA_HOST_COMPILER gcc)
endif()
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--use_fast_math;-ftz=true;-O4;-Xptxas -O4")
......
File mode changed from 100644 to 100755
#include "autotuner.h"
void Autotuner::Reset()
{
stopTuning = false;
workload = 100;
best_time = 0.;
best_workload = 0;
a = 1;
b = 50;
c = 100;
x = 50;
limit = 1;
fb = 0.;
fx = 0.;
if (algo == 3) workload = 50;
}
bool Autotuner::Needed(int iteration)
{
if (stopTuning) return false;
switch (algo)
{
case 1:
case 3:
return iteration % (stable + 1) == stable;
case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
default: /* Should never happen */;
}
return false;
}
bool Autotuner::Finished()
{
switch (algo)
{
case 1:
if (workload < 30)
{
workload = best_workload;
return stopTuning = true;
}
break;
case 2:
if (best_workload != 0) return stopTuning = true;
break;
case 3:
if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
break;
default: /* Should never happen */;
}
return false;
}
void Autotuner::Tune(double compTime)
{
switch (algo)
{
case 1: AlgoSimple(compTime); break;
case 2: AlgoRatio(compTime); break;
case 3: AlgoBisection(compTime); break;
default: /* Should never happen */;
}
}
void Autotuner::AlgoSimple(double compTime)
{
if (best_time == 0. || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
}
void Autotuner::AlgoRatio(double compTime)
{
if (best_time == 0.)
{
best_time = compTime;
workload = 1;
}
else
{
best_workload = (int) 100 * (compTime / (best_time + compTime));
workload = best_workload;
}
}
void Autotuner::AlgoBisection(double compTime)
{
if (fb == 0.)
{
fb = compTime;
x = 75;
workload = x;
return;
}
fx = compTime;
if (fx < fb)
{
if (x < b)
c = b;
else
a = b;
b = x;
fb = fx;
}
else
{
if (x < b)
a = x;
else
c = x;
}
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
workload = x;
}
......@@ -41,6 +41,7 @@
#include <fftw3.h>
#include <math.h>
#include "timer.h"
#include "autotuner.h"
#include "param.h"
#include "bioem.h"
......@@ -97,6 +98,7 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = false;
}
bioem::~bioem()
......@@ -363,19 +365,36 @@ int bioem::configure(int ac, char* av[])
printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
if(!param.printModel)pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, *this);
if (DebugOutput >= 2 && mpi_rank == 0)
// ****************** For autotuning **********************
if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
{
printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
Autotuning = true;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
}
else
{
Autotuning = false;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n");
}
// ****************** Initializng pointers *********************
// ****************** Initializing pointers *********************
deviceInit();
if (DebugOutput >= 2 && mpi_rank == 0) printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
if (DebugOutput >= 2 && mpi_rank == 0)
{
printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
if(!param.printModel)pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, *this);
if (DebugOutput >= 2 && mpi_rank == 0)
{
printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
return(0);
}
......@@ -520,6 +539,14 @@ int bioem::run()
HighResTimer timer, timer2;
/* Autotuning */
Autotuner aut;
if (Autotuning)
{
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -548,6 +575,13 @@ int bioem::run()
}
if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
/* Recalibrate if needed */
if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
{
aut.Reset();
rebalance(aut.Workload());
}
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
{
mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
......@@ -563,8 +597,7 @@ int bioem::run()
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
if (DebugOutput >= 2) timer.ResetStart();
if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
amp=param.CtfParam[iConv].pos[0];
......@@ -576,9 +609,10 @@ int bioem::run()
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
if (DebugOutput >= 2)
{
const double compTime = timer.GetCurrentElapsedTime();
compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
......@@ -586,7 +620,15 @@ int bioem::run()
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning && aut.Needed(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
}
}
if (DebugOutput >= 1)
......@@ -1413,3 +1455,5 @@ void bioem::free_device_host(void* ptr)
{
free(ptr);
}
void bioem::rebalance(int workload) {}
This diff is collapsed.
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#ifndef AUTOTUNER_H
#define AUTOTUNER_H
class Autotuner {
public:
Autotuner() {stopTuning = true;}
/* Setting variables to initial values */
inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }
/* Resetting variables to initial values */
void Reset();
/* Check if autotuning is needed, depending on which comparison is finished */
bool Needed(int iteration);
/* Check if optimal workload value has been computed */
bool Finished();
/* Set a new workload value to test, depending on the algorithm */
void Tune(double compTime);
/* Return workload value */
inline int Workload() {return workload;}
private:
int algo;
int stable;
bool stopTuning;
int workload;
/* Variables needed for AlgoSimple and AlgoRatio */
double best_time;
int best_workload;
/* Variables needed for AlgoBisection */
int a;
int b;
int c;
int x;
int limit;
double fb, fx;
/* Autotuning algorithms */
void AlgoSimple(double compTime);
void AlgoRatio(double compTime);
void AlgoBisection(double compTime);
};
#endif
......@@ -42,6 +42,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......@@ -71,6 +72,7 @@ protected:
int FFTAlgo; //Use the FFT Algorithm (Default 1)
int DebugOutput; //Debug Output Level (Default 2)
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
};
#endif
......@@ -33,6 +33,7 @@ public:
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload); //Rebalance GPUWorkload
protected:
virtual int deviceInit();
......
......@@ -91,6 +91,18 @@ struct myfloat3_t
#define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
/* Autotuning
Autotuning algorithms:
1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing.
2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed.
3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found.
*/
#define AUTOTUNING_ALGORITHM 3
/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */
#define RECALIB_FACTOR 200
/* After how many comparison iterations, comparison duration becomes stable */
#define FIRST_STABLE 7
static inline void* mallocchk(size_t size)
{
void* ptr = malloc(size);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment