diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f904164e0528d4db2f6c1b280d9a62aa562a9b9..6e3cda236a394a5a0b174ace3043a6736b5ba42d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ else()
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}")
 endif()
 
-set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp")
+set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp")
 
 ###Find Required Packages
 find_package(PkgConfig)
diff --git a/autotuner.cpp b/autotuner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..125fc6c651579f8c0259a773a1326d410a716273
--- /dev/null
+++ b/autotuner.cpp
@@ -0,0 +1,126 @@
+#include "autotuner.h"
+
+void Autotuner::Reset()
+{
+  stopTuning = false;
+  workload = 100;
+
+  best_time = 0.;
+  best_workload = 0;
+
+  a = 1;
+  b = 50;
+  c = 100;
+  x = 50;
+  limit = 1;
+  fb = 0.;
+  fx = 0.;
+
+  if (algo == 3) workload = 50;
+}
+
+bool Autotuner::Needed(int iteration)
+{
+  if (stopTuning) return false;
+
+  switch (algo)
+    {
+    case 1:
+    case 3:
+      return iteration % (stable + 1) == stable;
+    case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
+    default: /* Should never happen */;
+    }
+  return false;
+}
+
+bool Autotuner::Finished()
+{
+  switch (algo)
+    {
+    case 1:
+      if (workload < 30)
+	{
+	  workload = best_workload;
+	  return stopTuning = true;
+	}
+      break;
+    case 2:
+      if (best_workload != 0) return stopTuning = true;
+      break;
+    case 3:
+      if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
+      break;
+    default: /* Should never happen */;
+    }
+  return false;
+}
+
+void Autotuner::Tune(double compTime)
+{
+  switch (algo)
+    {
+    case 1: AlgoSimple(compTime); break;
+    case 2: AlgoRatio(compTime); break;
+    case 3: AlgoBisection(compTime); break;
+    default: /* Should never happen */;
+    }
+}
+
+void Autotuner::AlgoSimple(double compTime)
+{
+  if (best_time == 0. || compTime < best_time)
+    {
+      best_time = compTime;
+      best_workload = workload;
+    }
+
+  workload -= 5;
+}
+
+void Autotuner::AlgoRatio(double compTime)
+{
+  if (best_time == 0.)
+    {
+      best_time = compTime;
+      workload = 1;
+    }
+  else
+    {
+      best_workload = (int) 100 * (compTime / (best_time + compTime));
+      workload = best_workload;
+    }
+}
+
+void Autotuner::AlgoBisection(double compTime)
+{
+  if (fb == 0.)
+    {
+      fb = compTime;
+      x = 75;
+      workload = x;
+      return;
+    }
+
+  fx = compTime;
+
+  if (fx < fb)
+    {
+      if (x < b)
+	c = b;
+      else
+	a = b;
+      b = x;
+      fb = fx;
+    }
+  else
+    {
+      if (x < b)
+	a = x;
+      else
+	c = x;
+    }
+
+  x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
+  workload = x;
+}
diff --git a/bioem.cpp b/bioem.cpp
index b7457e348d5966ae76a84ae85e6f1801e984c848..ddaf52e6147bc2beff195fcbe8c16a90206a8898 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -41,6 +41,7 @@
 #include <fftw3.h>
 #include <math.h>
 #include "timer.h"
+#include "autotuner.h"
 
 #include "param.h"
 #include "bioem.h"
@@ -97,6 +98,7 @@ bioem::bioem()
   FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
   DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
   nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
+  Autotuning = false;
 }
 
 bioem::~bioem()
@@ -363,6 +365,19 @@ int bioem::configure(int ac, char* av[])
       printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
       timer.ResetStart();
     }
+
+  // ****************** For autotuning **********************
+  if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
+    {
+      Autotuning = true;
+      if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
+    }
+  else
+    {
+      Autotuning = false;
+      if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n");
+    }
+
   // ****************** Initializing pointers *********************
 
   deviceInit();
@@ -524,6 +539,14 @@ int bioem::run()
 
   HighResTimer timer, timer2;
 
+  /* Autotuning */
+  Autotuner aut;
+  if (Autotuning)
+    {
+      aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
+      rebalance(aut.Workload());      
+    }
+
   if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
 
 
@@ -552,6 +575,13 @@ int bioem::run()
 	}
       if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
 
+      /* Recalibrate if needed */
+      if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
+	{
+	  aut.Reset();
+	  rebalance(aut.Workload());
+	}
+
       for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
 	{
 	  mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
@@ -567,8 +597,7 @@ int bioem::run()
 	      createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
 	      if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
 
-	  
-	      if (DebugOutput >= 2) timer.ResetStart();
+      	      if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
      	      myfloat_t amp,pha,env;
 
               amp=param.CtfParam[iConv].pos[0];
@@ -580,9 +609,10 @@ int bioem::run()
 
 	      compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
 
+	      double compTime=0.;
 	      if (DebugOutput >= 2)
 		{
-		  const double compTime = timer.GetCurrentElapsedTime();
+		  compTime = timer.GetCurrentElapsedTime();
 		  const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
 		  const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
 		    (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
@@ -590,7 +620,15 @@ int bioem::run()
 		    (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
 		  const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
 
-		  printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
+		  if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
+		  else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
+		}
+	      if (Autotuning && aut.Needed(iConv))
+		{
+		  if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
+		  aut.Tune(compTime);
+		  if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
+		  rebalance(aut.Workload());
 		}
 	    }
 	  if (DebugOutput >= 1)
@@ -1417,3 +1455,5 @@ void bioem::free_device_host(void* ptr)
 {
   free(ptr);
 }
+
+void bioem::rebalance(int workload) {}
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index 482433153434d89e2e7dc41c7afeb99ce343486f..71a5c97ee2f30633ad69d64f139fc5cac4064798 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -137,6 +137,7 @@ bioem_cuda::bioem_cuda()
 	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
 	GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
 	GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
+	if (GPUWorkload == -1) GPUWorkload = 100;
 	GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
 }
 
@@ -699,6 +700,23 @@ void bioem_cuda::free_device_host(void* ptr)
 	cudaFreeHost(ptr);
 }
 
+void bioem_cuda::rebalance(int workload)
+{
+	if ((workload < 0) || (workload > 100) || (workload == GPUWorkload)) return;
+
+	deviceFinishRun();
+
+	if (DebugOutput >= 2)
+	{
+	  printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank);
+	}
+
+	GPUWorkload = workload;
+	maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
+
+	deviceStartRun();
+}
+
 bioem* bioem_cuda_create()
 {
 	int count;
diff --git a/include/autotuner.h b/include/autotuner.h
new file mode 100644
index 0000000000000000000000000000000000000000..10db9ca8d21810f883d4d3bbf74dd9895a9e1498
--- /dev/null
+++ b/include/autotuner.h
@@ -0,0 +1,62 @@
+/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   < BioEM software for Bayesian inference of Electron Microscopy images>
+   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
+        Volker Lindenstruth and Gerhard Hummer.
+   Max Planck Institute of Biophysics, Frankfurt, Germany.
+
+   See license statement for terms of distribution.
+
+   ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+
+#ifndef AUTOTUNER_H
+#define AUTOTUNER_H
+
+class Autotuner {
+
+public:
+	Autotuner() {stopTuning = true;}
+
+	/* Setting variables to initial values */
+	inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }
+
+	/* Resetting variables to initial values */
+	void Reset();
+
+	/* Check if autotuning is needed, depending on which comparison is finished */
+	bool Needed(int iteration);
+
+	/* Check if optimal workload value has been computed */
+	bool Finished();
+
+	/* Set a new workload value to test, depending on the algorithm */
+	void Tune(double compTime);
+
+	/* Return workload value */
+	inline int Workload() {return workload;}
+
+private:
+	int algo;
+	int stable;
+
+	bool stopTuning;
+	int workload;
+
+	/* Variables needed for AlgoSimple and AlgoRatio */
+	double best_time;
+	int best_workload;
+
+	/* Variables needed for AlgoBisection */
+	int a;
+	int b;
+	int c;
+	int x;
+	int limit;
+	double fb, fx;
+
+	/* Autotuning algorithms */
+	void AlgoSimple(double compTime);
+	void AlgoRatio(double compTime);
+	void AlgoBisection(double compTime);
+};
+
+#endif
diff --git a/include/bioem.h b/include/bioem.h
index 7c0e0d91a9008981567f240152843f14fcce8e39..98222791db36eb60d1061d63c74793719364cf24 100644
--- a/include/bioem.h
+++ b/include/bioem.h
@@ -42,6 +42,7 @@ public:
 
 	virtual void* malloc_device_host(size_t size);
 	virtual void free_device_host(void* ptr);
+	virtual void rebalance(int workload); //Rebalance GPUWorkload
 
 	int createProjection(int iMap, mycomplex_t* map);
 	int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
@@ -71,6 +72,7 @@ protected:
 	int FFTAlgo;				//Use the FFT Algorithm (Default 1)
 	int DebugOutput;			//Debug Output Level (Default 2)
 	int nProjectionsAtOnce;		//Number of projections to do at once via OpenMP (Default 1)
+	bool Autotuning;				//Do the autotuning of the load-balancing between CPUs and GPUs
 };
 
 #endif
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 4d44c5e1c3904c4048566dc578e3d38b7f6b6de2..708b40fb3e9a2a10b265965baaa224244159b975 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -33,6 +33,7 @@ public:
 	virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
 	virtual void* malloc_device_host(size_t size);
 	virtual void free_device_host(void* ptr);
+	virtual void rebalance(int workload); //Rebalance GPUWorkload
 
 protected:
 	virtual int deviceInit();
diff --git a/include/defs.h b/include/defs.h
index b9635a935331bdbca489f4983364126acd98cbae..b7338ca8246126becf8cebce6191b0fa7ca86ec6 100644
--- a/include/defs.h
+++ b/include/defs.h
@@ -91,6 +91,18 @@ struct myfloat3_t
 #define CUDA_FFTS_AT_ONCE 1024
 //#define BIOEM_USE_NVTX
 
+/* Autotuning
+   Autotuning algorithms:
+    1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing.
+    2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed.
+    3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found.
+ */
+#define AUTOTUNING_ALGORITHM 3
+/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */
+#define RECALIB_FACTOR 200
+/* After how many comparison iterations, comparison duration becomes stable */
+#define FIRST_STABLE 7
+
 static inline void* mallocchk(size_t size)
 {
 	void* ptr = malloc(size);