diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd731f9a7896cfebf6d49a81eba7bd93c5c78da6..3b077b8d7681ce5beee3bc66b2e03d47206662e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,7 +29,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
 
 include_directories(include)
 
-set (BIOEM_ICC_FLAGS "-xHost -O3 -fno-alias -fno-fnalias -unroll -g0 -ipo")
+set (BIOEM_ICC_FLAGS "-O3 -fno-alias -fno-fnalias -unroll -g0 -ip")
 set (BIOEM_GCC_FLAGS "-O3 -march=native -fweb -mfpmath=sse -frename-registers -minline-all-stringops -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -ggdb")
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
@@ -50,11 +50,6 @@ if (NOT FFTW_FOUND)
 endif()
 include_directories(${FFTW_INCLUDE_DIRS})
 
-find_package( Boost 1.43 REQUIRED COMPONENTS program_options )
-include_directories( ${Boost_INCLUDE_DIRS} )
-
-
-
 ###Find Optional Packages
 
 ###Find CUDA
@@ -163,7 +158,6 @@ if (FFTWF_LIBRARIES)
 else()
         target_link_libraries(bioEM -L${FFTW_LIBDIR} -lfftw3 -lfftw3f)
 endif()
-target_link_libraries(bioEM ${Boost_PROGRAM_OPTIONS_LIBRARY})
 
 if (MPI_FOUND)
         target_link_libraries(bioEM ${MPI_LIBRARIES})
@@ -172,7 +166,6 @@ endif()
 ###Show Status
 message(STATUS "Build Status")
 message(STATUS "FFTW library: ${FFTW_LIBDIR}")
-message(STATUS "Boost directory: ${Boost_LIBRARY_DIRS}")
 message(STATUS "FFTW includedir: ${FFTW_INCLUDEDIR}")
 message(STATUS "CUDA libraries:  ${CUDA_CUDA_LIBRARY}")
 message(STATUS "CUDART libraries:  ${CUDA_LIBRARIES}")
diff --git a/autotuner.cpp b/autotuner.cpp
index 125fc6c651579f8c0259a773a1326d410a716273..c31cd427d4ce7e6116e23a9390d39647356609c3 100644
--- a/autotuner.cpp
+++ b/autotuner.cpp
@@ -1,3 +1,15 @@
+/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   < BioEM software for Bayesian inference of Electron Microscopy images>
+   Copyright (C) 2017 Pilar Cossio, Markus Rampp, Luka Stanisic and Gerhard
+   Hummer.
+   Max Planck Institute of Biophysics, Frankfurt, Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
+
+   Released under the GNU Public License, v3.
+   See license statement for terms of distribution.
+
+   ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+
 #include "autotuner.h"
 
 void Autotuner::Reset()
@@ -16,64 +28,75 @@ void Autotuner::Reset()
   fb = 0.;
   fx = 0.;
 
-  if (algo == 3) workload = 50;
+  if (algo == 3)
+    workload = 50;
 }
 
 bool Autotuner::Needed(int iteration)
 {
-  if (stopTuning) return false;
+  if (stopTuning)
+    return false;
 
   switch (algo)
-    {
+  {
     case 1:
     case 3:
       return iteration % (stable + 1) == stable;
-    case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
+    case 2:
+      return (iteration == (int) stable / 2) || (iteration == stable);
     default: /* Should never happen */;
-    }
+  }
   return false;
 }
 
 bool Autotuner::Finished()
 {
   switch (algo)
-    {
+  {
     case 1:
       if (workload < 30)
-	{
-	  workload = best_workload;
-	  return stopTuning = true;
-	}
+      {
+        workload = best_workload;
+        return stopTuning = true;
+      }
       break;
     case 2:
-      if (best_workload != 0) return stopTuning = true;
+      if (best_workload != 0)
+        return stopTuning = true;
       break;
     case 3:
-      if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
+      if ((c - b == limit) && (b - a == limit))
+        return stopTuning = true;
       break;
     default: /* Should never happen */;
-    }
+  }
   return false;
 }
 
 void Autotuner::Tune(double compTime)
 {
   switch (algo)
-    {
-    case 1: AlgoSimple(compTime); break;
-    case 2: AlgoRatio(compTime); break;
-    case 3: AlgoBisection(compTime); break;
+  {
+    case 1:
+      AlgoSimple(compTime);
+      break;
+    case 2:
+      AlgoRatio(compTime);
+      break;
+    case 3:
+      AlgoBisection(compTime);
+      break;
     default: /* Should never happen */;
-    }
+  }
 }
 
 void Autotuner::AlgoSimple(double compTime)
 {
   if (best_time == 0. || compTime < best_time)
-    {
-      best_time = compTime;
-      best_workload = workload;
-    }
+  {
+    best_time = compTime;
+    best_workload = workload;
+  }
 
   workload -= 5;
 }
@@ -81,46 +104,46 @@ void Autotuner::AlgoSimple(double compTime)
 void Autotuner::AlgoRatio(double compTime)
 {
   if (best_time == 0.)
-    {
-      best_time = compTime;
-      workload = 1;
-    }
+  {
+    best_time = compTime; // first sample becomes the reference time
+    workload = 1;
+  }
   else
-    {
-      best_workload = (int) 100 * (compTime / (best_time + compTime));
-      workload = best_workload;
-    }
+  {
+    best_workload = (int) (100 * (compTime / (best_time + compTime)));
+    workload = best_workload; // percentage share, truncated to int
+  }
 }
 
 void Autotuner::AlgoBisection(double compTime)
 {
   if (fb == 0.)
-    {
-      fb = compTime;
-      x = 75;
-      workload = x;
-      return;
-    }
+  {
+    fb = compTime;
+    x = 75;
+    workload = x;
+    return;
+  }
 
   fx = compTime;
 
   if (fx < fb)
-    {
-      if (x < b)
-	c = b;
-      else
-	a = b;
-      b = x;
-      fb = fx;
-    }
+  {
+    if (x < b)
+      c = b;
+    else
+      a = b;
+    b = x;
+    fb = fx;
+  }
   else
-    {
-      if (x < b)
-	a = x;
-      else
-	c = x;
-    }
-
-  x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
+  {
+    if (x < b)
+      a = x;
+    else
+      c = x;
+  }
+
+  x = (c - b > b - a) ? (int) (b + (c - b) / 2) : (int) (a + (b - a + 1) / 2);
   workload = x;
 }
diff --git a/bioem.cpp b/bioem.cpp
index 8b5d284dd4017df08d3a40dfd01022681bb6b547..1cf3cd02b2b1df89836a110a01fe46eaf7b41b6b 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -1,414 +1,538 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-	Volker Lindenstruth and Gerhard Hummer.
-
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+
 #ifdef WITH_MPI
 #include <mpi.h>
 
-#define MPI_CHK(expr)							\
-  if (expr != MPI_SUCCESS)						\
-    {									\
-      fprintf(stderr, "Error in MPI function %s: %d\n", __FILE__, __LINE__); \
-    }
+#define MPI_CHK(expr)                                                          \
+  if (expr != MPI_SUCCESS)                                                     \
+  {                                                                            \
+    fprintf(stderr, "Error in MPI function %s: %d\n", __FILE__, __LINE__);     \
+  }
 #endif
 
+#include "MersenneTwister.h"
+#include <algorithm>
+#include <cmath>
 #include <fstream>
-#include <boost/program_options.hpp>
-#include <boost/random/normal_distribution.hpp>
-#include <boost/random/uniform_int_distribution.hpp>
-#include <boost/random/mersenne_twister.hpp>
+#include <getopt.h>
 #include <iostream>
-#include <algorithm>
 #include <iterator>
+#include <queue>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <string>
-#include <cmath>
+#include <vector>
 
 #ifdef WITH_OPENMP
 #include <omp.h>
 #endif
 
+#include "autotuner.h"
+#include "timer.h"
 #include <fftw3.h>
 #include <math.h>
-#include "timer.h"
-#include "autotuner.h"
 
-#include "param.h"
 #include "bioem.h"
-#include "model.h"
 #include "map.h"
+#include "model.h"
+#include "param.h"
 
 #ifdef BIOEM_USE_NVTX
 #include "nvToolsExt.h"
 
-const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
-const int num_colors = sizeof(colors)/sizeof(colors[0]);
-enum myColor { COLOR_PROJECTION, COLOR_CONVOLUTION, COLOR_COMPARISON, COLOR_WORKLOAD, COLOR_INIT };
+const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
+                           0x0000ffff, 0x00ff0000, 0x00ffffff};
+const int num_colors = sizeof(colors) / sizeof(colors[0]);
+enum myColor
+{
+  COLOR_PROJECTION,
+  COLOR_CONVOLUTION,
+  COLOR_COMPARISON,
+  COLOR_WORKLOAD,
+  COLOR_INIT
+};
 
 // Projection number is stored in category attribute
 // Convolution number is stored in payload attribute
 
-#define cuda_custom_timeslot(name,iMap,iConv,cid) {	\
-    int color_id = cid;					\
-    color_id = color_id%num_colors;			\
-    nvtxEventAttributes_t eventAttrib = {0};		\
-    eventAttrib.version = NVTX_VERSION;			\
-    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;	\
-    eventAttrib.colorType = NVTX_COLOR_ARGB;		\
-    eventAttrib.color = colors[color_id];		\
-    eventAttrib.category = iMap;			\
-    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64; \
-    eventAttrib.payload.llValue = iConv;		\
-    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;	\
-    eventAttrib.message.ascii = name;			\
-    nvtxRangePushEx(&eventAttrib);			\
+#define cuda_custom_timeslot(name, iMap, iConv, cid)                           \
+  {                                                                            \
+    int color_id = cid;                                                        \
+    color_id = color_id % num_colors;                                          \
+    nvtxEventAttributes_t eventAttrib = {0};                                   \
+    eventAttrib.version = NVTX_VERSION;                                        \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                          \
+    eventAttrib.colorType = NVTX_COLOR_ARGB;                                   \
+    eventAttrib.color = colors[color_id];                                      \
+    eventAttrib.category = iMap;                                               \
+    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_UNSIGNED_INT64;                \
+    eventAttrib.payload.llValue = iConv;                                       \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                         \
+    eventAttrib.message.ascii = name;                                          \
+    nvtxRangePushEx(&eventAttrib);                                             \
   }
 #define cuda_custom_timeslot_end nvtxRangePop();
 #else
-#define cuda_custom_timeslot(name,iMap,iConv,cid)
+#define cuda_custom_timeslot(name, iMap, iConv, cid)
 #define cuda_custom_timeslot_end
 #endif
 
 #include "bioem_algorithm.h"
 
-using namespace boost;
-namespace po = boost::program_options;
-namespace bran= boost::random;
-
 using namespace std;
 
-/* For dvl nodes in hydra with problem in boost
-   namespace std {
-   typedef decltype(nullptr) nullptr_t;
-   }*/
-
-// A helper function of Boost
-template<class T>
-ostream& operator<<(ostream& os, const vector<T>& v)
-{
-  copy(v.begin(), v.end(), ostream_iterator<T>(os, " "));
-  return os;
-}
-
 bioem::bioem()
 {
-  FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
-  DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 0 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
-  nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
+  BioEMAlgo = getenv("BIOEM_ALGO") == NULL ? 1 : atoi(getenv("BIOEM_ALGO"));
+
+  DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ?
+                    0 :
+                    atoi(getenv("BIOEM_DEBUG_OUTPUT"));
+
+  if (getenv("BIOEM_PROJ_CONV_AT_ONCE") != NULL)
+  {
+    nProjectionsAtOnce = atoi(getenv("BIOEM_PROJ_CONV_AT_ONCE"));
+    if (BioEMAlgo == 1 && getenv("GPU") && atoi(getenv("GPU")) &&
+        nProjectionsAtOnce > 1)
+    {
+      printf("Warning: using parallel convolutions with GPUs can create race "
+             "condition and lead to inaccurate results. "
+             "BIOEM_PROJ_CONV_AT_ONCE is going to be set 1.\n");
+      nProjectionsAtOnce = 1;
+    }
+  }
+  else if (BioEMAlgo == 1)
+    nProjectionsAtOnce = 1;
+  else
+    nProjectionsAtOnce =
+        getenv("OMP_NUM_THREADS") == NULL ? 1 : atoi(getenv("OMP_NUM_THREADS"));
+
+  if (getenv("BIOEM_CUDA_THREAD_COUNT") != NULL)
+    CudaThreadCount = atoi(getenv("BIOEM_CUDA_THREAD_COUNT"));
+  else if (BioEMAlgo == 1)
+    CudaThreadCount = CUDA_THREAD_COUNT_ALGO1;
+  else
+    CudaThreadCount = CUDA_THREAD_COUNT_ALGO2;
+
   Autotuning = false;
 }
 
-bioem::~bioem()
+bioem::~bioem() {}
+
+// Print every non-hidden entry of myoptions[0 .. myoptions_length) to stdout
+// as "  --name [arg] description", padding names to a common column width.
+void bioem::printOptions(myoption_t *myoptions, int myoptions_length)
 {
+  printf("\nCommand line inputs:\n");
+  // Longest visible name; strlen() yields size_t, so convert once to int
+  int maxlen = 0;
+  for (int i = 0; i < myoptions_length; i++)
+  {
+    if (myoptions[i].hidden)
+      continue;
+    const int len = (int) strlen(myoptions[i].name);
+    if (maxlen < len)
+      maxlen = len;
+  }
+
+  for (int i = 0; i < myoptions_length; i++)
+  {
+    if (myoptions[i].hidden)
+      continue;
+    printf("  --%-*s", maxlen, myoptions[i].name);
+    // Options without an argument get blanks so descriptions stay aligned
+    printf("%s", myoptions[i].arg == required_argument ? " arg" : "    ");
+    printf(" %s\n", myoptions[i].desc);
+  }
+  printf("\n");
 }
 
-int bioem::configure(int ac, char* av[])
+int bioem::readOptions(int ac, char *av[])
 {
-  // **************************************************************************************
-  // **** Configuration Routine using boost for extracting parameters, models and maps ****
-  // **************************************************************************************
-  // ****** And Precalculating necessary grids, map crosscorrelations and kernels  ********
-  // *************************************************************************************
-
   HighResTimer timer;
 
+  // *** Initializing default variables ***
+  std::string infile, modelfile, mapfile, Inputanglefile, Inputbestmap;
+  Model.readPDB = false;
+  param.param_device.writeAngles = 0;
+  param.dumpMap = false;
+  param.loadMap = false;
+  param.printModel = false;
+  RefMap.readMRC = false;
+  RefMap.readMultMRC = false;
+  param.notuniformangles = false;
+  OutfileName = "Output_Probabilities";
+
+  cout << " ++++++++++++ FROM COMMAND LINE +++++++++++\n\n";
+
+  // Write your options here
+  myoption_t myoptions[] = {
+      {"Modelfile", required_argument, "(Mandatory) Name of model file", false},
+      {"Particlesfile", required_argument,
+       "(Mandatory) Name of particle-image file", false},
+      {"Inputfile", required_argument,
+       "(Mandatory) Name of input parameter file", false},
+      {"PrintBestCalMap", required_argument,
+       "(Optional) Only print best calculated map. NO BioEM!", true},
+      {"ReadOrientation", required_argument,
+       "(Optional) Read file name containing orientations", false},
+      {"ReadPDB", no_argument, "(Optional) If reading model file in PDB format",
+       false},
+      {"ReadMRC", no_argument,
+       "(Optional) If reading particle file in MRC format", false},
+      {"ReadMultipleMRC", no_argument, "(Optional) If reading Multiple MRCs",
+       false},
+      {"DumpMaps", no_argument,
+       "(Optional) Dump maps after they were read from particle-image file",
+       false},
+      {"LoadMapDump", no_argument, "(Optional) Read Maps from dump option",
+       false},
+      {"OutputFile", required_argument,
+       "(Optional) For changing the outputfile name", false},
+      {"help", no_argument, "(Optional) Produce help message", false}};
+  int myoptions_length = sizeof(myoptions) / sizeof(myoption_t);
+
+  // If not all Mandatory parameters are defined
+  if ((ac < 2))
+  {
+    printf("Error: Need to specify all mandatory options\n");
+    printOptions(myoptions, myoptions_length);
+    return 1;
+  }
 
+  // Option table for getopt_long(): zero-filled with one spare end marker
+  std::vector<option> long_options_storage(myoptions_length + 1);
+  struct option *long_options = &long_options_storage[0]; // freed on return
+  for (int i = 0; i < myoptions_length; i++)
+  {
+    long_options[i].name = myoptions[i].name;
+    long_options[i].has_arg = myoptions[i].arg;
+  }
 
-  std::string infile, modelfile, mapfile,Inputanglefile,Inputbestmap; 
-  if (mpi_rank == 0)
-    {
-      // *** Inizialzing default variables ***
-      std::string infile, modelfile, mapfile,Inputanglefile,Inputbestmap;
-      Model.readPDB = false;
-      param.param_device.writeAngles = false;
-      param.dumpMap = false;
-      param.loadMap = false;
-      RefMap.readMRC = false;
-      RefMap.readMultMRC = false;
-      param.notuniformangles=false;
-      yesoutfilename=false;
-
-      // *************************************************************************************
-      cout << " ++++++++++++ FROM COMMAND LINE +++++++++++\n\n";
-      // *************************************************************************************
-
-      // ********************* Command line reading input with BOOST ************************
-
-      try {
-	po::options_description desc("Command line inputs");
-	desc.add_options()
-	  ("Modelfile", po::value< std::string>() , "(Mandatory) Name of model file")
-	  ("Particlesfile", po::value< std::string>(), "if BioEM (Mandatory) Name of particle-image file")
-	  ("Inputfile", po::value<std::string>(), "if BioEM (Mandatory) Name of input parameter file") 
-	  ("PrintBestCalMap", po::value< std::string>(), "(Optional) Only print best calculated map. NO BioEM (!)")
-	  ("ReadOrientation", po::value< std::string>(), "(Optional) Read file name containing orientations")
-	  ("ReadPDB", "(Optional) If reading model file in PDB format")
-	  ("ReadMRC", "(Optional) If reading particle file in MRC format")
-	  ("ReadMultipleMRC", "(Optional) If reading Multiple MRCs")
-	  ("DumpMaps", "(Optional) Dump maps after they were read from particle-image file")
-	  ("LoadMapDump", "(Optional) Read Maps from dump option")
-	  ("OutputFile",  po::value< std::string>(), "(Optional) For changing the outputfile name")
-	  ("help", "(Optional) Produce help message")
-	  ;
-
-
-	po::positional_options_description p;
-	p.add("Inputfile", -1);
-	p.add("Modelfile", -1);
-	p.add("Particlesfile", -1);
-	p.add("ReadPDB", -1);
-	p.add("ReadMRC", -1);
-	p.add("ReadMultipleMRC", -1);
-	p.add("ReadOrientation",-1);
-	p.add("PrintBestCalMap",-1);
-	p.add("DumpMaps", -1);
-	p.add("LoadMapDump", -1);
-        p.add("OutputFile",-1);
-
-	po::variables_map vm;
-	po::store(po::command_line_parser(ac, av).
-		  options(desc).positional(p).run(), vm);
-	po::notify(vm);
-
-	if((ac < 4)) {
-	  std::cout << desc << std::endl;
-	  return 1;
-	}
-	if (vm.count("help")) {
-	  cout << "Usage: options_description [options]\n";
-	  cout << desc;
-	  return 1;
-	}
-
-	if (vm.count("Inputfile"))
-	  {
-	    cout << "Input file is: ";
-	    cout << vm["Inputfile"].as< std::string >() << "\n";
-	    infile = vm["Inputfile"].as< std::string >();
-	  }
-	if (vm.count("Modelfile"))
-	  {
-	    cout << "Model file is: "
-		 << vm["Modelfile"].as<  std::string  >() << "\n";
-	    modelfile = vm["Modelfile"].as<  std::string  >();
-	  }
-	if (vm.count("ReadPDB"))
-	  {
-	    cout << "Reading model file in PDB format.\n";
-	    Model.readPDB = true;
-	  }
-	if (vm.count("ReadOrientation"))
-	  {
-	    cout << "Reading Orientation from file: "
-		 << vm["ReadOrientation"].as<  std::string  >() << "\n";
-	    cout << "Important! if using Quaternions, include \n";
-	    cout << "QUATERNIONS keyword in INPUT PARAMETER FILE\n";
-	    cout << "First row in file should be the total number of orientations (int)\n";
-	    cout << "Euler angle format should be alpha (12.6f) beta (12.6f) gamma (12.6f)\n";
-	    cout << "Quaternion format q1 (12.6f) q2 (12.6f) q3 (12.6f) q4 (12.6f)\n";
-	    Inputanglefile = vm["ReadOrientation"].as<  std::string  >();
-	    param.notuniformangles=true;
-	  }
-	if (vm.count("OutputFile"))
-          {
-	    OutfileName = vm["OutputFile"].as< std::string >();
-	    cout << "Writing OUTPUT to: " <<  vm["OutputFile"].as<  std::string  >() << "\n";
-	    yesoutfilename=true;
-	  }
-	if (vm.count("PrintBestCalMap"))
-	  {
-	    cout << "Reading Euler Angles from file: "
-		 << vm["PrintBestCalMap"].as<  std::string  >() << "\n";
-	    Inputbestmap = vm["PrintBestCalMap"].as<  std::string  >();
-	    param.printModel=true;
-	  }
-
-	if (vm.count("ReadMRC"))
-	  {
-	    cout << "Reading particle file in MRC format.\n";
-	    RefMap.readMRC=true;
-	  }
-
-	if (vm.count("ReadMultipleMRC"))
-	  {
-	    cout << "Reading Multiple MRCs.\n";
-	    RefMap.readMultMRC=true;
-	  }
-
-	if (vm.count("DumpMaps"))
-	  {
-	    cout << "Dumping Maps after reading from file.\n";
-	    param.dumpMap = true;
-	  }
-
-	if (vm.count("LoadMapDump"))
-	  {
-	    cout << "Loading Map dump.\n";
-	    param.loadMap = true;
-	  }
-
-	if (vm.count("Particlesfile"))
-	  {
-	    cout << "Paricle file is: "
-		 << vm["Particlesfile"].as< std::string >() << "\n";
-	    mapfile = vm["Particlesfile"].as< std::string >();
-	  }
-      }
-      catch(std::exception& e)
-	{
-	  cout << e.what() << "\n";
-	  return 1;
-	}
-
-      //check for consitency in multiple MRCs
-      if(RefMap.readMultMRC && not(RefMap.readMRC))
-	{
-	  cout << "For Multiple MRCs command --ReadMRC is necesary too";
-	  exit(1);
-	}
-
-      if(!Model.readPDB){
-	cout << "Note: Reading model in simple text format (not PDB)\n";
-	cout << "----  x   y   z  radius  density ------- \n";
-      } 
-
-      if (DebugOutput >= 2 && mpi_rank == 0) timer.ResetStart();
-      // ********************* Reading Parameter Input ***************************
-      if(!param.printModel){
-	// Standard definition for BioEM
-	param.readParameters(infile.c_str());
-
-	// ********************* Reading Particle Maps Input **********************
-	RefMap.readRefMaps(param, mapfile.c_str());
-
-
-      } else{
-	// Reading parameters for only writting down Best projection
-
-	param.forprintBest(Inputbestmap.c_str());
-      }	
-
-      // ********************* Reading Model Input ******************************
-      Model.readModel(param, modelfile.c_str());
-
-      cout << "**NOTE:: look at file COORDREAD to confirm that the Model coordinates are correct\n";
-
-      if (DebugOutput >= 2 && mpi_rank == 0) printf("Reading Input Data Time: %f\n", timer.GetCurrentElapsedTime());
-    
-      if(param.param_device.writeCC && mpi_size>1){
-	cout << "Exiting::: WRITE CROSS-CORRELATION ONLY VAILD FOR 1 MPI PROCESS\n";
-        exit(1);
-      }
+  int myopt;
+  while (1)
+  {
+    /* getopt_long stores the option index here. */
+    int option_index = 0;
+    myopt = getopt_long(ac, av, "", long_options, &option_index);
+
+    /* Detect the end of the options. */
+    if (myopt == -1)
+      break;
 
-      // Generating Grids of orientations 
-      if(!param.printModel)param.CalculateGridsParam(Inputanglefile.c_str());
+    switch (myopt)
+    {
+      case 0:
+#ifdef DEBUG
+        printf("option %s", long_options[option_index].name);
+        if (optarg)
+          printf(" with arg %s", optarg);
+        printf("\n");
+#endif
+        // Here write actions for each option
+        if (!strcmp(long_options[option_index].name, "help"))
+        {
+          cout << "Usage: options_description [options]\n";
+          printOptions(myoptions, myoptions_length);
+          return 1;
+        }
+        if (!strcmp(long_options[option_index].name, "Inputfile"))
+        {
+          cout << "Input file is: " << optarg << "\n";
+          infile = optarg;
+        }
+        if (!strcmp(long_options[option_index].name, "Modelfile"))
+        {
+          cout << "Model file is: " << optarg << "\n";
+          modelfile = optarg;
+        }
+        if (!strcmp(long_options[option_index].name, "ReadPDB"))
+        {
+          cout << "Reading model file in PDB format.\n";
+          Model.readPDB = true;
+        }
+        if (!strcmp(long_options[option_index].name, "ReadOrientation"))
+        {
+          cout << "Reading Orientation from file: " << optarg << "\n";
+          cout << "Important! if using Quaternions, include \n";
+          cout << "QUATERNIONS keyword in INPUT PARAMETER FILE\n";
+          cout << "First row in file should be the total number of "
+                  "orientations "
+                  "(int)\n";
+          cout << "Euler angle format should be alpha (12.6f) beta (12.6f) "
+                  "gamma (12.6f)\n";
+          cout << "Quaternion format q1 (12.6f) q2 (12.6f) q3 (12.6f) q4 "
+                  "(12.6f)\n";
+          Inputanglefile = optarg;
+          param.notuniformangles = true;
+        }
+        if (!strcmp(long_options[option_index].name, "OutputFile"))
+        {
+          cout << "Writing OUTPUT to: " << optarg << "\n";
+          OutfileName = optarg;
+        }
+        if (!strcmp(long_options[option_index].name, "PrintBestCalMap"))
+        {
+          cout << "Reading Best Parameters from file: " << optarg << "\n";
+          Inputbestmap = optarg;
+          param.printModel = true;
+        }
+        if (!strcmp(long_options[option_index].name, "ReadMRC"))
+        {
+          cout << "Reading particle file in MRC format.\n";
+          RefMap.readMRC = true;
+        }
+        if (!strcmp(long_options[option_index].name, "ReadMultipleMRC"))
+        {
+          cout << "Reading Multiple MRCs.\n";
+          RefMap.readMultMRC = true;
+        }
+        if (!strcmp(long_options[option_index].name, "DumpMaps"))
+        {
+          cout << "Dumping Maps after reading from file.\n";
+          param.dumpMap = true;
+        }
+        if (!strcmp(long_options[option_index].name, "LoadMapDump"))
+        {
+          cout << "Loading Map dump.\n";
+          param.loadMap = true;
+        }
+        if (!strcmp(long_options[option_index].name, "Particlesfile"))
+        {
+          cout << "Particle file is: " << optarg << "\n";
+          mapfile = optarg;
+        }
+        break;
+      case '?':
+        /* getopt_long already printed an error message. */
+        printOptions(myoptions, myoptions_length);
+        return 1;
+      default:
+        abort();
     }
+  }
+  /* Print any remaining command line arguments (not options) and exit */
+  if (optind < ac)
+  {
+    printf("Error: non-option ARGV-elements: ");
+    while (optind < ac)
+      printf("%s ", av[optind++]);
+    putchar('\n');
+    printOptions(myoptions, myoptions_length);
+    return 1;
+  }
 
-#ifdef WITH_MPI
+  // check for consistency in multiple MRCs
+  if (RefMap.readMultMRC && not(RefMap.readMRC))
+  {
+    cout << "For Multiple MRCs command --ReadMRC is necesary too";
+    exit(1);
+  }
 
+  if (!Model.readPDB)
+  {
+    cout << "Note: Reading model in simple text format (not PDB)\n";
+    cout << "----  x   y   z  radius  density ------- \n";
+  }
 
+  if (DebugOutput >= 2 && mpi_rank == 0)
+    timer.ResetStart();
 
-  // ********************* MPI inizialization/ Transfer of parameters******************
-  if (mpi_size > 1)
-    {
-      if (DebugOutput >= 2 && mpi_rank == 0) timer.ResetStart();
-      MPI_Bcast(&param, sizeof(param), MPI_BYTE, 0, MPI_COMM_WORLD);
-      //We have to reinitialize all pointers !!!!!!!!!!!!
-      if (mpi_rank != 0) param.angprior = NULL;
+  // *** Reading Parameter Input ***
+  if (!param.printModel)
+  {
+    // Standard definition for BioEM
+    param.readParameters(infile.c_str());
+    // *** Reading Particle Maps Input ***
+    RefMap.readRefMaps(param, mapfile.c_str());
+  }
+  else
+  {
+    // Reading parameters for only writing down Best projection
+    param.forprintBest(Inputbestmap.c_str());
+  }
 
-      if (mpi_rank != 0)param.angles =  (myfloat3_t*) mallocchk(param.nTotGridAngles  * sizeof (myfloat3_t));
-      MPI_Bcast(param.angles, param.nTotGridAngles  * sizeof (myfloat3_t),MPI_BYTE, 0, MPI_COMM_WORLD);
+  // *** Reading Model Input ***
+  Model.readModel(param, modelfile.c_str());
 
-#ifdef DEBUG
-      for(int n=0;n<param.nTotGridAngles;n++){
-	cout << "CHECK: Angle orient " << mpi_rank << " "<< n << " " <<  param.angles[n].pos[0] << " " <<  param.angles[n].pos[1] << " " << param.angles[n].pos[2] << " " << param.angles[n].quat4  << " " << "\n";} 
+  cout << "**NOTE:: look at file COORDREAD to confirm that the Model "
+          "coordinates are correct\n";
 
-#endif
-      //****refCtf, CtfParam, angles automatically filled by precalculate function bellow
+  if (DebugOutput >= 2 && mpi_rank == 0)
+    printf("Reading Input Data Time: %f\n", timer.GetCurrentElapsedTime());
+
+  // Generating Grids of orientations
+  if (!param.printModel)
+    param.CalculateGridsParam(Inputanglefile.c_str());
+
+  return (0);
+}
+
+int bioem::configure(int ac, char *av[])
+{
+  // **************************************************************************************
+  // **** Configuration Routine using getopts for extracting parameters, models
+  // and maps ****
+  // **************************************************************************************
+  // ****** And Precalculating necessary grids, map crosscorrelations and
+  // kernels  ********
+  // *************************************************************************************
+
+  HighResTimer timer;
+
+  if (mpi_rank == 0 && readOptions(ac, av))
+    return 1;
+
+#ifdef WITH_MPI
 
-      MPI_Bcast(&Model, sizeof(Model), MPI_BYTE, 0, MPI_COMM_WORLD);
-      if (mpi_rank != 0) Model.points = (bioem_model::bioem_model_point*) mallocchk(sizeof(bioem_model::bioem_model_point) * Model.nPointsModel);
-      MPI_Bcast(Model.points, sizeof(bioem_model::bioem_model_point) * Model.nPointsModel, MPI_BYTE, 0, MPI_COMM_WORLD);
+  // ********************* MPI initialization / Transfer of
+  // parameters ******************
+  if (mpi_size > 1)
+  {
+    if (DebugOutput >= 2 && mpi_rank == 0)
+      timer.ResetStart();
+    MPI_Bcast(&param, sizeof(param), MPI_BYTE, 0, MPI_COMM_WORLD);
+    // We have to reinitialize all pointers !!!!!!!!!!!!
+    if (mpi_rank != 0)
+      param.angprior = NULL;
 
-      MPI_Bcast(&RefMap, sizeof(RefMap), MPI_BYTE, 0, MPI_COMM_WORLD);
-      if (mpi_rank != 0) RefMap.maps = (myfloat_t*) mallocchk(RefMap.refMapSize * sizeof(myfloat_t) * RefMap.ntotRefMap);
-      MPI_Bcast(RefMap.maps, RefMap.refMapSize * sizeof(myfloat_t) * RefMap.ntotRefMap, MPI_BYTE, 0, MPI_COMM_WORLD);
-      if (DebugOutput >= 2 && mpi_rank == 0) printf("MPI Broadcast of Input Data %f\n", timer.GetCurrentElapsedTime());
+    if (mpi_rank != 0)
+      param.angles =
+          (myfloat3_t *) mallocchk(param.nTotGridAngles * sizeof(myfloat3_t));
+    MPI_Bcast(param.angles, param.nTotGridAngles * sizeof(myfloat3_t), MPI_BYTE,
+              0, MPI_COMM_WORLD);
 
+#ifdef DEBUG
+    for (int n = 0; n < param.nTotGridAngles; n++)
+    {
+      cout << "CHECK: Angle orient " << mpi_rank << " " << n << " "
+           << param.angles[n].pos[0] << " " << param.angles[n].pos[1] << " "
+           << param.angles[n].pos[2] << " " << param.angles[n].quat4 << " "
+           << "\n";
     }
+
+#endif
+    //****refCtf, CtfParam, angles automatically filled by precalculate function
+    // below
+
+    MPI_Bcast(&Model, sizeof(Model), MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (mpi_rank != 0)
+      Model.points = (bioem_model::bioem_model_point *) mallocchk(
+          sizeof(bioem_model::bioem_model_point) * Model.nPointsModel);
+    MPI_Bcast(Model.points,
+              sizeof(bioem_model::bioem_model_point) * Model.nPointsModel,
+              MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    MPI_Bcast(&RefMap, sizeof(RefMap), MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (mpi_rank != 0)
+      RefMap.maps = (myfloat_t *) mallocchk(
+          RefMap.refMapSize * sizeof(myfloat_t) * RefMap.ntotRefMap);
+    MPI_Bcast(RefMap.maps,
+              RefMap.refMapSize * sizeof(myfloat_t) * RefMap.ntotRefMap,
+              MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (DebugOutput >= 2 && mpi_rank == 0)
+      printf("MPI Broadcast of Input Data %f\n", timer.GetCurrentElapsedTime());
+  }
 #endif
 
   // ****************** Precalculating Necessary Stuff *********************
-  if (DebugOutput >= 2 && mpi_rank == 0) timer.ResetStart();
+  if (DebugOutput >= 2 && mpi_rank == 0)
+    timer.ResetStart();
   param.PrepareFFTs();
 
   if (DebugOutput >= 2 && mpi_rank == 0)
-    {
-      printf("Time Prepare FFTs %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
+  {
+    printf("Time Prepare FFTs %f\n", timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
   precalculate();
 
   // ****************** For debugging *********************
   if (getenv("BIOEM_DEBUG_BREAK"))
-    {
-      const int cut = atoi(getenv("BIOEM_DEBUG_BREAK"));
-      if (param.nTotGridAngles > cut) param.nTotGridAngles = cut;
-      if (param.nTotCTFs > cut) param.nTotCTFs = cut;
-    }
+  {
+    const int cut = atoi(getenv("BIOEM_DEBUG_BREAK"));
+    if (param.nTotGridAngles > cut)
+      param.nTotGridAngles = cut;
+    if (param.nTotCTFs > cut)
+      param.nTotCTFs = cut;
+  }
 
   if (DebugOutput >= 2 && mpi_rank == 0)
-    {
-      printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
+  {
+    printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
+
+  // Number of parallel Convolutions and Comparisons
+  param.nTotParallelConv = min(param.nTotCTFs, nProjectionsAtOnce);
 
   // ****************** For autotuning **********************
-  if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
-    {
-      Autotuning = true;
-      if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
-    }
+  if ((getenv("GPU") && atoi(getenv("GPU"))) && (BioEMAlgo == 1) &&
+      ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) &&
+      (!getenv("BIOEM_DEBUG_BREAK") ||
+       (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
+  {
+    Autotuning = true;
+    if (mpi_rank == 0)
+      printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm "
+             "%d\n\tRecalibration at every %d projections\n\tComparisons are "
+             "considered stable after first %d comparisons\n",
+             AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
+  }
   else
+  {
+    Autotuning = false;
+    if (mpi_rank == 0)
     {
-      Autotuning = false;
-      if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n");
+      printf("Autotuning of GPUWorkload disabled");
+      if (getenv("GPU") && atoi(getenv("GPU")))
+        printf(", using GPUWorkload: %d%%\n",
+               (getenv("GPUWORKLOAD") && (atoi(getenv("GPUWORKLOAD")) != -1)) ?
+                   atoi(getenv("GPUWORKLOAD")) :
+                   100);
+      else
+        printf(", please enable GPUs\n");
     }
+  }
 
   // ****************** Initializing pointers *********************
 
   deviceInit();
 
   if (DebugOutput >= 2 && mpi_rank == 0)
-    {
-      printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
+  {
+    printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
 
-  if(!param.printModel)pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, *this);
+  if (!param.printModel)
+    pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, *this);
 
   if (DebugOutput >= 2 && mpi_rank == 0)
-    {
-      printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
+  {
+    printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
 
-  return(0);
+  return (0);
 }
 
 void bioem::cleanup()
 {
-  //Deleting allocated pointers
+  // Deleting allocated pointers
   free_device_host(pProb.ptr);
   RefMap.freePointers();
 }
@@ -416,723 +540,1019 @@ void bioem::cleanup()
 int bioem::precalculate()
 {
   // **************************************************************************************
-  // **Precalculating Routine of Orientation grids, Map crosscorrelations and CTF Kernels**
+  // **Precalculating Routine of Orientation grids, Map crosscorrelations and
+  // CTF Kernels**
   // **************************************************************************************
   HighResTimer timer;
-  if (DebugOutput >= 3)
-    {
-      printf("\tTime Precalculate Grids Param: %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
+  if (DebugOutput >= 2)
+  {
+    printf("\tTime Precalculate Grids Param: %f\n",
+           timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
   // Precalculating CTF Kernels stored in class Param
   param.CalculateRefCTF();
 
-  if (DebugOutput >= 3)
-    {
-      printf("\tTime Precalculate CTFs: %f\n", timer.GetCurrentElapsedTime());
-      timer.ResetStart();
-    }
-  //Precalculate Maps
-  if(!param.printModel) RefMap.precalculate(param, *this);
-  if (DebugOutput >= 3) printf("\tTime Precalculate Maps: %f\n", timer.GetCurrentElapsedTime());
+  if (DebugOutput >= 2)
+  {
+    printf("\tTime Precalculate CTFs: %f\n", timer.GetCurrentElapsedTime());
+    timer.ResetStart();
+  }
+  // Precalculate Maps
+  if (!param.printModel)
+    RefMap.precalculate(param, *this);
+  if (DebugOutput >= 2)
+    printf("\tTime Precalculate Maps: %f\n", timer.GetCurrentElapsedTime());
 
-  return(0);
+  return (0);
 }
 
-int bioem::run()
+int bioem::printModel()
 {
-
   // **************************************************************************************
-  // ********** Secondary routine for printing out the only best projection ***************
+  // ********** Secondary routine for printing out only the best projection
+  // ***************
   // **************************************************************************************
 
-  if(mpi_rank == 0 && param.printModel){ //Only works for 1 MPI process (not parallelized)
-
-    cout << "\nAnalysis for printing best projection::: \n \n" ; 
-    mycomplex_t* proj_mapsFFT;
-    myfloat_t* conv_map = NULL;
-    mycomplex_t* conv_mapFFT;
-    myfloat_t sumCONV, sumsquareCONV;
+  cout << "\nAnalysis for printing best projection::: \n \n";
+  mycomplex_t *proj_mapsFFT;
+  myfloat_t *conv_map = NULL;
+  mycomplex_t *conv_mapFFT;
+  myfloat_t sumCONV, sumsquareCONV;
 
-    proj_mapsFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
-    conv_mapFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
-    conv_map = (myfloat_t*) myfftw_malloc(sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
+  proj_mapsFFT = (mycomplex_t *) myfftw_malloc(
+      sizeof(mycomplex_t) * param.param_device.NumberPixels *
+      param.param_device.NumberFFTPixels1D);
+  conv_mapFFT = (mycomplex_t *) myfftw_malloc(
+      sizeof(mycomplex_t) * param.param_device.NumberPixels *
+      param.param_device.NumberFFTPixels1D);
+  conv_map = (myfloat_t *) myfftw_malloc(sizeof(myfloat_t) *
+                                         param.param_device.NumberPixels *
+                                         param.param_device.NumberPixels);
 
-    cout << "...... Calculating Projection .......................\n " ;
+  cout << "...... Calculating Projection .......................\n ";
 
-    createProjection(0, proj_mapsFFT);
+  createProjection(0, proj_mapsFFT);
 
-    cout << "...... Calculating Convolution .......................\n " ;
+  cout << "...... Calculating Convolution .......................\n ";
 
-    createConvolutedProjectionMap(0, 0, proj_mapsFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
+  createConvolutedProjectionMap_noFFT(proj_mapsFFT, conv_map, conv_mapFFT,
+                                      sumCONV, sumsquareCONV);
 
-  }
+  return (0);
+}
 
+int bioem::run()
+{
   // **************************************************************************************
-  // **** Main BioEM routine, projects, convolutes and compares with Map using OpenMP ****
+  // **** Main BioEM routine, projects, convolutes and compares with Map using
+  // OpenMP ****
   // **************************************************************************************
 
-  // **** If we want to control the number of threads -> omp_set_num_threads(XX); ******
-  // ****************** Declarying class of Probability Pointer  *************************
+  // **** If we want to control the number of threads ->
+  // omp_set_num_threads(XX); ******
+  // ****************** Declaring class of Probability Pointer
+  // *************************
   cuda_custom_timeslot("Initialization", -1, -1, COLOR_INIT);
-  if (mpi_rank == 0) printf("\tInitializing Probabilities\n");
+  if (mpi_rank == 0)
+    printf("\tInitializing Probabilities\n");
 
   // Contros for MPI
-  if(mpi_size > param.nTotGridAngles){
-    cout << "EXIT: Wrong MPI setup More MPI processes than orientations\n"; exit(1);
+  if (mpi_size > param.nTotGridAngles)
+  {
+    cout << "EXIT: Wrong MPI setup More MPI processes than orientations\n";
+    exit(1);
   }
 
   // Inizialzing Probabilites to zero and constant to -Infinity
-  for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap ++)
-    {
-      bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
+  for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap++)
+  {
+    bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
 
-      pProbMap.Total = 0.0;
-      pProbMap.Constoadd = -FLT_MAX; //Problem if using double presicion
+    pProbMap.Total = 0.0;
+    pProbMap.Constoadd = MIN_PROB;
 
-      if (param.param_device.writeAngles)
-	{
-	  for (int iOrient = 0; iOrient < param.nTotGridAngles; iOrient ++)
-	    {
-	      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
-
-	      pProbAngle.forAngles = 0.0;
-	      pProbAngle.ConstAngle = -FLT_MAX;
-	    }
-	}
-
-      if (param.param_device.writeCC)
-	{      int  cc=0;
-	  for (int cent_x = 0; cent_x < param.param_device.NumberPixels; cent_x = cent_x + param.param_device.CCdisplace)
-	    {
-	      for (int cent_y = 0; cent_y < param.param_device.NumberPixels; cent_y = cent_y + param.param_device.CCdisplace)
-		{
-		  bioem_Probability_cc& pProbCC = pProb.getProbCC(iRefMap, cc);
-		  //Debuggin:: cout << iRefMap << " " << cc << " " << cent_x << " " << cent_y << "\n";
-
-		  if(!param.param_device.CCwithBayes) {
-		    pProbCC.forCC=-FLT_MAX;
-		  }else {
-		    pProbCC.forCC = 0.0;
-		    pProbCC.ConstCC=-FLT_MAX;
-		  }
-		  cc++;
-		}
-	    }
-	  if(!FFTAlgo){cout << "Cross correlation calculation must be with enviormental variable FFTALGO=1\n"; exit(1);}
-	}                 
-    }
+    if (param.param_device.writeAngles)
+    {
+      for (int iOrient = 0; iOrient < param.nTotGridAngles; iOrient++)
+      {
+        bioem_Probability_angle &pProbAngle =
+            pProb.getProbAngle(iRefMap, iOrient);
 
-  if(!FFTAlgo){cout << "Remark: Not using FFT algorithm. Not using Prior in B-Env.";}
+        pProbAngle.forAngles = 0.0;
+        pProbAngle.ConstAngle = MIN_PROB;
+      }
+    }
+  }
 
   // **************************************************************************************
 
   deviceStartRun();
 
-  // ******************************** MAIN CYCLE ******************************************
-
-  mycomplex_t* proj_mapsFFT;
-  myfloat_t* conv_map = NULL;
-  mycomplex_t* conv_mapFFT;
-  myfloat_t sumCONV, sumsquareCONV;
-
-  //allocating fftw_complex vector
-  const int ProjMapSize = (param.FFTMapSize + 64) & ~63;	//Make sure this is properly aligned for fftw..., Actually this should be ensureb by using FFTMapSize, but it is not due to a bug in CUFFT which cannot handle padding properly
-  //******** Alocating Vectors *************
-  proj_mapsFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * ProjMapSize * nProjectionsAtOnce);
-  conv_mapFFT = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
-  if (!FFTAlgo) conv_map = (myfloat_t*) myfftw_malloc(sizeof(myfloat_t) * param.param_device.NumberPixels * param.param_device.NumberPixels);
-
-  cuda_custom_timeslot_end; //Ending initialization
+  // ******************************** MAIN CYCLE
+  // ******************************************
+
+  mycomplex_t *proj_mapsFFT;
+  mycomplex_t *conv_mapsFFT;
+  myparam5_t *comp_params =
+      new myparam5_t[param.nTotParallelConv * PIPELINE_LVL];
+  int iPipeline = 0;
+
+  // allocating fftw_complex vector
+  const int ProjMapSize =
+      (param.FFTMapSize + 64) & ~63; // Make sure this is properly aligned for
+  // fftw..., Actually this should be ensured by
+  // using FFTMapSize, but it is not due to a bug
+  // in CUFFT which cannot handle padding properly
+  //******** Allocating Vectors *************
+  proj_mapsFFT = (mycomplex_t *) myfftw_malloc(
+      sizeof(mycomplex_t) * ProjMapSize * nProjectionsAtOnce);
+  conv_mapsFFT =
+      (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param.FFTMapSize *
+                                    param.nTotParallelConv * PIPELINE_LVL);
+
+  cuda_custom_timeslot_end; // Ending initialization
 
   HighResTimer timer, timer2;
 
   /* Autotuning */
   Autotuner aut;
   if (Autotuning)
-    {
-      aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
-      rebalanceWrapper(aut.Workload());
-    }
-
-  if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
-
-
+  {
+    aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
+    rebalanceWrapper(aut.Workload());
+  }
 
-  const int iOrientStart = (int) ((long long int) mpi_rank * param.nTotGridAngles / mpi_size);
-  int iOrientEnd = (int) ((long long int) (mpi_rank + 1) * param.nTotGridAngles / mpi_size);
-  if (iOrientEnd > param.nTotGridAngles) iOrientEnd = param.nTotGridAngles;
+  if (DebugOutput >= 1 && mpi_rank == 0)
+    printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², "
+           "Pixels %d², OMP Threads %d, MPI Ranks %d\n",
+           param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap,
+           2 * param.param_device.maxDisplaceCenter +
+               param.param_device.GridSpaceCenter,
+           param.param_device.GridSpaceCenter, param.param_device.NumberPixels,
+           omp_get_max_threads(), mpi_size);
+
+  const int iOrientStart =
+      (int) ((long long int) mpi_rank * param.nTotGridAngles / mpi_size);
+  int iOrientEnd =
+      (int) ((long long int) (mpi_rank + 1) * param.nTotGridAngles / mpi_size);
+  if (iOrientEnd > param.nTotGridAngles)
+    iOrientEnd = param.nTotGridAngles;
 
   /* Vectors for computing statistic on different parts of the code */
   TimeStat ts((iOrientEnd - iOrientStart), param.nTotCTFs);
-  if (DebugOutput >= 1) ts.InitTimeStat(4);
+  if (DebugOutput >= 1)
+    ts.InitTimeStat(4);
 
-  // **************************Loop Over orientations***************************************
+  // **************************Loop Over
+  // orientations***************************************
 
-  for (int iOrientAtOnce = iOrientStart; iOrientAtOnce < iOrientEnd; iOrientAtOnce += nProjectionsAtOnce)
+  for (int iOrientAtOnce = iOrientStart; iOrientAtOnce < iOrientEnd;
+       iOrientAtOnce += nProjectionsAtOnce)
+  {
+    // ***************************************************************************************
+    // ***** Creating Projection for given orientation and transforming to
+    // Fourier space *****
+    if (DebugOutput >= 1)
     {
-      // ***************************************************************************************
-      // ***** Creating Projection for given orientation and transforming to Fourier space *****
-      if (DebugOutput >= 1)
-	{
-	  timer2.ResetStart();
-	  timer.ResetStart();
-	}
-      int iTmpEnd = std::min(iOrientEnd, iOrientAtOnce + nProjectionsAtOnce);
+      timer2.ResetStart();
+      timer.ResetStart();
+    }
+    int iOrientEndAtOnce =
+        std::min(iOrientEnd, iOrientAtOnce + nProjectionsAtOnce);
 
-      // **************************Parallel orientations for projections at once***************
+// **************************Parallel orientations for projections at
+// once***************
 
 #pragma omp parallel for
-      for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
-	{
-	  createProjection(iOrient, &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]);
-	}
+    for (int iOrient = iOrientAtOnce; iOrient < iOrientEndAtOnce; iOrient++)
+    {
+      createProjection(iOrient,
+                       &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize]);
+    }
+    if (DebugOutput >= 1)
+    {
+      ts.time = timer.GetCurrentElapsedTime();
+      ts.Add(TS_PROJECTION);
+      if (DebugOutput >= 2)
+        printf("\tTime Projection %d-%d: %f (rank %d)\n", iOrientAtOnce,
+               iOrientEndAtOnce - 1, ts.time, mpi_rank);
+    }
+    /* Recalibrate if needed */
+    if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) &&
+        ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) &&
+        (iOrientAtOnce != iOrientStart))
+    {
+      aut.Reset();
+      rebalanceWrapper(aut.Workload());
+    }
+
+    for (int iOrient = iOrientAtOnce; iOrient < iOrientEndAtOnce; iOrient++)
+    {
+      mycomplex_t *proj_mapFFT =
+          &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
+
+      // ***************************************************************************************
+      // ***** **** Internal Loop over PSF/CTF convolutions **** *****
+      for (int iConvAtOnce = 0; iConvAtOnce < param.nTotCTFs;
+           iConvAtOnce += param.nTotParallelConv)
+      {
+        if (DebugOutput >= 1)
+          timer.ResetStart();
+        int iConvEndAtOnce =
+            std::min(param.nTotCTFs, iConvAtOnce + param.nTotParallelConv);
+        // Total number of convolutions that can be treated in this iteration in
+        // parallel
+        int maxParallelConv = iConvEndAtOnce - iConvAtOnce;
+#pragma omp parallel for
+        for (int iConv = iConvAtOnce; iConv < iConvEndAtOnce; iConv++)
+        {
+          // *** Calculating convolutions of projection map and
+          // crosscorrelations ***
+          int i =
+              (iPipeline & 1) * param.nTotParallelConv + (iConv - iConvAtOnce);
+          mycomplex_t *localmultFFT = &conv_mapsFFT[i * param.FFTMapSize];
+
+          createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT,
+                                        localmultFFT, comp_params[i].sumC,
+                                        comp_params[i].sumsquareC);
+
+          comp_params[i].amp = param.CtfParam[iConv].pos[0];
+          comp_params[i].pha = param.CtfParam[iConv].pos[1];
+          comp_params[i].env = param.CtfParam[iConv].pos[2];
+        }
+        if (DebugOutput >= 1)
+        {
+          ts.time = timer.GetCurrentElapsedTime();
+          ts.Add(TS_CONVOLUTION);
+          if (DebugOutput >= 2)
+            printf("\t\tTime Convolution %d %d-%d: %f (rank %d)\n", iOrient,
+                   iConvAtOnce, iConvEndAtOnce - 1, ts.time, mpi_rank);
+        }
+
+        // ******************Internal loop over Reference images CUDA or
+        // OpenMP******************
+        // *** Comparing each calculated convoluted map with all experimental
+        // maps ***
+        ts.time = 0.;
+        if ((DebugOutput >= 1) || (Autotuning && aut.Needed(iConvAtOnce)))
+          timer.ResetStart();
+        compareRefMaps(iPipeline++, iOrient, iConvAtOnce, maxParallelConv,
+                       conv_mapsFFT, comp_params);
+        if (DebugOutput >= 1)
+        {
+          ts.time = timer.GetCurrentElapsedTime();
+          ts.Add(TS_COMPARISON);
+        }
+        if (DebugOutput >= 2)
+        {
+          if (Autotuning)
+            printf("\t\tTime Comparison %d %d-%d: %f sec with GPU workload "
+                   "%d%% (rank %d)\n",
+                   iOrient, iConvAtOnce, iConvEndAtOnce - 1, ts.time,
+                   aut.Workload(), mpi_rank);
+          else
+            printf("\t\tTime Comparison %d %d-%d: %f sec (rank %d)\n", iOrient,
+                   iConvAtOnce, iConvEndAtOnce - 1, ts.time, mpi_rank);
+        }
+        if (Autotuning && aut.Needed(iConvAtOnce))
+        {
+          if (ts.time == 0.)
+            ts.time = timer.GetCurrentElapsedTime();
+          aut.Tune(ts.time);
+          if (aut.Finished() && DebugOutput >= 1)
+            printf("\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(),
+                   mpi_rank);
+          rebalanceWrapper(aut.Workload());
+        }
+      }
       if (DebugOutput >= 1)
-	{
-	  ts.time = timer.GetCurrentElapsedTime();
-	  ts.Add(TS_PROJECTION);
-	  if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, ts.time, mpi_rank);
-	}
-      /* Recalibrate if needed */
-      if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
-	{
-	  aut.Reset();
-	  rebalanceWrapper(aut.Workload());
-	}
-
-      for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
-	{
-	  mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
-
-	  // ***************************************************************************************
-	  // ***** **** Internal Loop over PSF/CTF convolutions **** *****
-
-	  for (int iConv = 0; iConv < param.nTotCTFs; iConv++)
-	    {
-	      // *** Calculating convolutions of projection map and crosscorrelations ***
-	      if (DebugOutput >= 1) timer.ResetStart();
-	      createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
-	      if (DebugOutput >= 1)
-		{
-		  ts.time = timer.GetCurrentElapsedTime();
-		  ts.Add(TS_CONVOLUTION);
-		  if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, ts.time, mpi_rank);
-		}
-
-	      if ((DebugOutput >= 1) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
-	      myfloat_t amp,pha,env;
-
-	      amp=param.CtfParam[iConv].pos[0];
-	      pha=param.CtfParam[iConv].pos[1];
-	      env=param.CtfParam[iConv].pos[2];
-
-	      // ******************Internal loop over Reference images CUDA or OpenMP******************
-	      // *** Comparing each calculated convoluted map with all experimental maps ***
-
-	      compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
-
-	      ts.time = 0.;
-	      if (DebugOutput >= 1)
-		{
-		  ts.time = timer.GetCurrentElapsedTime();
-		  ts.Add(TS_COMPARISON);
-		}
-	      if (DebugOutput >= 2)
-		{
-		  const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
-		  const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
-		    (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / ts.time;
-		  const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
-		    (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / ts.time;
-		  const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / ts.time;
-
-		  if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, ts.time, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
-		  else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, ts.time, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
-		}
-	      if (Autotuning && aut.Needed(iConv))
-		{
-		  if (ts.time == 0.) ts.time = timer.GetCurrentElapsedTime();
-		  aut.Tune(ts.time);
-		  if (aut.Finished() && DebugOutput >= 1) printf("\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
-		  rebalanceWrapper(aut.Workload());
-		}
-	    }
-	  if (DebugOutput >= 1)
-	    {
-	      ts.time = timer2.GetCurrentElapsedTime();
-	      ts.Add(TS_TPROJECTION);
-	      printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, ts.time, mpi_rank);
-	      timer2.ResetStart();
-	    }
-	}
+      {
+        ts.time = timer2.GetCurrentElapsedTime();
+        ts.Add(TS_TPROJECTION);
+        printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient,
+               ts.time, mpi_rank);
+        timer2.ResetStart();
+      }
     }
+  }
   /* Statistical summary on different parts of the code */
   if (DebugOutput >= 1)
-    {
-      ts.PrintTimeStat(mpi_rank);
-      ts.EmptyTimeStat();
-    }
+  {
+    ts.PrintTimeStat(mpi_rank);
+    ts.EmptyTimeStat();
+  }
 
-  //deallocating fftw_complex vector
+  // deallocating fftw_complex vector
   myfftw_free(proj_mapsFFT);
-  myfftw_free(conv_mapFFT);
-  if (!FFTAlgo) myfftw_free(conv_map);
+  myfftw_free(conv_mapsFFT);
 
   deviceFinishRun();
 
-
-
-  // ************* Collecing all the probabilities from MPI replicas ***************
+// *******************************************************************************
+// ************* Collecting all the probabilities from MPI replicas
+// ***************
 
 #ifdef WITH_MPI
   if (mpi_size > 1)
+  {
+    if (DebugOutput >= 1 && mpi_rank == 0)
+      timer.ResetStart();
+    // Reduce Constant and summarize probabilities
     {
-      if (DebugOutput >= 1 && mpi_rank == 0) timer.ResetStart();
-      //Reduce Constant and summarize probabilities
+      myprob_t *tmp1 = new myprob_t[RefMap.ntotRefMap];
+      myprob_t *tmp2 = new myprob_t[RefMap.ntotRefMap];
+      myprob_t *tmp3 = new myprob_t[RefMap.ntotRefMap];
+      for (int i = 0; i < RefMap.ntotRefMap; i++)
+      {
+        tmp1[i] = pProb.getProbMap(i).Constoadd;
+      }
+      MPI_Allreduce(tmp1, tmp2, RefMap.ntotRefMap, MY_MPI_FLOAT, MPI_MAX,
+                    MPI_COMM_WORLD);
+
+      for (int i = 0; i < RefMap.ntotRefMap; i++)
       {
-	myfloat_t* tmp1 = new myfloat_t[RefMap.ntotRefMap];
-	myfloat_t* tmp2 = new myfloat_t[RefMap.ntotRefMap];
-	myfloat_t* tmp3 = new myfloat_t[RefMap.ntotRefMap];
-	for (int i = 0;i < RefMap.ntotRefMap;i++)
-	  {
-	    tmp1[i] = pProb.getProbMap(i).Constoadd;
-	  }
-	MPI_Allreduce(tmp1, tmp2, RefMap.ntotRefMap, MY_MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
-
-	for (int i = 0;i < RefMap.ntotRefMap;i++)
-	  {
-	    bioem_Probability_map& pProbMap = pProb.getProbMap(i);
+        bioem_Probability_map &pProbMap = pProb.getProbMap(i);
 #ifdef DEBUG
-	    cout << "Reduction " << mpi_rank << " Map " << i << " Prob " << pProbMap.Total << " Const " << pProbMap.Constoadd  << "\n";     
+        cout << "Reduction " << mpi_rank << " Map " << i << " Prob "
+             << pProbMap.Total << " Const " << pProbMap.Constoadd << "\n";
 #endif
-	    tmp1[i] = pProbMap.Total * exp(pProbMap.Constoadd - tmp2[i]);
-
-	  }
-	MPI_Reduce(tmp1, tmp3, RefMap.ntotRefMap, MY_MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
-
-	//Find MaxProb
-	MPI_Status mpistatus;
-	{
-	  int* tmpi1 = new int[RefMap.ntotRefMap];
-	  int* tmpi2 = new int[RefMap.ntotRefMap];
-	  for (int i = 0;i < RefMap.ntotRefMap;i++)
-	    {
-	      bioem_Probability_map& pProbMap = pProb.getProbMap(i);
-	      tmpi1[i] = tmp2[i] <= pProbMap.Constoadd ? mpi_rank : -1;
-              //temporary array that has the mpirank for the highest pProb.constant
-	    }
-	  MPI_Allreduce(tmpi1, tmpi2, RefMap.ntotRefMap, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
-	  for (int i = 0;i < RefMap.ntotRefMap;i++)
-	    {
-	      if (tmpi2[i] == -1)
-		{
-		  if (mpi_rank == 0) printf("Error: Could not find highest probability\n");
-		}
-	      else if (tmpi2[i] != 0) //Skip if rank 0 already has highest probability
-		{
-		  if (mpi_rank == 0)
-		    {
-		      MPI_Recv(&pProb.getProbMap(i).max, sizeof(pProb.getProbMap(i).max), MPI_BYTE, tmpi2[i], i, MPI_COMM_WORLD, &mpistatus);
-		    }
-		  else if (mpi_rank == tmpi2[i])
-		    {
-		      MPI_Send(&pProb.getProbMap(i).max, sizeof(pProb.getProbMap(i).max), MPI_BYTE, 0, i, MPI_COMM_WORLD);
-		    }
-		}
-	    }
-	  delete[] tmpi1;
-	  delete[] tmpi2;
-	}
-
-	if (mpi_rank == 0)
-	  {
-	    for (int i = 0;i < RefMap.ntotRefMap;i++)
-	      {
-		bioem_Probability_map& pProbMap = pProb.getProbMap(i);
-		pProbMap.Total = tmp3[i];
-		pProbMap.Constoadd = tmp2[i];
-	      }
-	  }
-
-	delete[] tmp1;
-	delete[] tmp2;
-	delete[] tmp3;
-	if (DebugOutput >= 1 && mpi_rank == 0 && mpi_size > 1) printf("Time MPI Reduction: %f\n", timer.GetCurrentElapsedTime());
+        tmp1[i] = pProbMap.Total * exp(pProbMap.Constoadd - tmp2[i]);
       }
+      MPI_Reduce(tmp1, tmp3, RefMap.ntotRefMap, MY_MPI_FLOAT, MPI_SUM, 0,
+                 MPI_COMM_WORLD);
 
-      //Angle Reduction and Probability summation for individual angles
-      if (param.param_device.writeAngles)
-	{
-	  const int count = RefMap.ntotRefMap * param.nTotGridAngles;
-	  myfloat_t* tmp1 = new myfloat_t[count];
-	  myfloat_t* tmp2 = new myfloat_t[count];
-	  myfloat_t* tmp3 = new myfloat_t[count];
-	  for (int i = 0;i < RefMap.ntotRefMap;i++)
-	    {
-	      for (int j = 0;j < param.nTotGridAngles;j++)
-                {
-		  //	      tmp1[i] = pProb.getProbMap(i).Constoadd;
-		  //	      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(i, j);
-		  tmp1[i * param.nTotGridAngles + j]= pProb.getProbAngle(i, j).ConstAngle;
-		}
-	    }
-
-	  MPI_Allreduce(tmp1, tmp2, count, MY_MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
-	  for (int i = 0;i < RefMap.ntotRefMap;i++)
-	    {
-	      for (int j = 0;j < param.nTotGridAngles;j++)
-		{
-		  bioem_Probability_angle& pProbAngle = pProb.getProbAngle(i, j);
-		  tmp1[i * param.nTotGridAngles + j] = pProbAngle.forAngles * exp(pProbAngle.ConstAngle - tmp2[i * param.nTotGridAngles + j]);
-		}
-	    }
-	  MPI_Reduce(tmp1, tmp3, count, MY_MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
-	  if (mpi_rank == 0)
-	    {
-	      for (int i = 0;i < RefMap.ntotRefMap;i++)
-		{
-		  for (int j = 0;j < param.nTotGridAngles;j++)
-		    {
-		      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(i, j);
-		      pProbAngle.forAngles = tmp3[i * param.nTotGridAngles + j];
-		      pProbAngle.ConstAngle = tmp2[i * param.nTotGridAngles + j];
-		    }
-		}
-	    }
-	  delete[] tmp1;
-	  delete[] tmp2;
-	  delete[] tmp3;
-	}
+      // Find MaxProb
+      MPI_Status mpistatus;
+      {
+        int *tmpi1 = new int[RefMap.ntotRefMap];
+        int *tmpi2 = new int[RefMap.ntotRefMap];
+        for (int i = 0; i < RefMap.ntotRefMap; i++)
+        {
+          bioem_Probability_map &pProbMap = pProb.getProbMap(i);
+          tmpi1[i] = tmp2[i] <= pProbMap.Constoadd ? mpi_rank : -1;
+          // temporary array that has the mpirank for the highest pProb.constant
+        }
+        MPI_Allreduce(tmpi1, tmpi2, RefMap.ntotRefMap, MPI_INT, MPI_MAX,
+                      MPI_COMM_WORLD);
+        for (int i = 0; i < RefMap.ntotRefMap; i++)
+        {
+          if (tmpi2[i] == -1)
+          {
+            if (mpi_rank == 0)
+              printf("Error: Could not find highest probability\n");
+          }
+          else if (tmpi2[i] !=
+                   0) // Skip if rank 0 already has highest probability
+          {
+            if (mpi_rank == 0)
+            {
+              MPI_Recv(&pProb.getProbMap(i).max,
+                       sizeof(pProb.getProbMap(i).max), MPI_BYTE, tmpi2[i], i,
+                       MPI_COMM_WORLD, &mpistatus);
+            }
+            else if (mpi_rank == tmpi2[i])
+            {
+              MPI_Send(&pProb.getProbMap(i).max,
+                       sizeof(pProb.getProbMap(i).max), MPI_BYTE, 0, i,
+                       MPI_COMM_WORLD);
+            }
+          }
+        }
+        delete[] tmpi1;
+        delete[] tmpi2;
+      }
+
+      if (mpi_rank == 0)
+      {
+        for (int i = 0; i < RefMap.ntotRefMap; i++)
+        {
+          bioem_Probability_map &pProbMap = pProb.getProbMap(i);
+          pProbMap.Total = tmp3[i];
+          pProbMap.Constoadd = tmp2[i];
+        }
+      }
+
+      delete[] tmp1;
+      delete[] tmp2;
+      delete[] tmp3;
+      if (DebugOutput >= 1 && mpi_rank == 0 && mpi_size > 1)
+        printf("Time MPI Reduction: %f\n", timer.GetCurrentElapsedTime());
     }
-#endif
 
+    // Angle Reduction and Probability summation for individual angles
+    if (param.param_device.writeAngles)
+    {
+      const int count = RefMap.ntotRefMap * param.nTotGridAngles;
+      myprob_t *tmp1 = new myprob_t[count];
+      myprob_t *tmp2 = new myprob_t[count];
+      myprob_t *tmp3 = new myprob_t[count];
+      for (int i = 0; i < RefMap.ntotRefMap; i++)
+      {
+        for (int j = 0; j < param.nTotGridAngles; j++)
+        {
+          //	      tmp1[i] = pProb.getProbMap(i).Constoadd;
+          //	      bioem_Probability_angle& pProbAngle =
+          // pProb.getProbAngle(i, j);
+          tmp1[i * param.nTotGridAngles + j] =
+              pProb.getProbAngle(i, j).ConstAngle;
+        }
+      }
+
+      MPI_Allreduce(tmp1, tmp2, count, MY_MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
+      for (int i = 0; i < RefMap.ntotRefMap; i++)
+      {
+        for (int j = 0; j < param.nTotGridAngles; j++)
+        {
+          bioem_Probability_angle &pProbAngle = pProb.getProbAngle(i, j);
+          tmp1[i * param.nTotGridAngles + j] =
+              pProbAngle.forAngles *
+              exp(pProbAngle.ConstAngle - tmp2[i * param.nTotGridAngles + j]);
+        }
+      }
+      MPI_Reduce(tmp1, tmp3, count, MY_MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
+      if (mpi_rank == 0)
+      {
+        for (int i = 0; i < RefMap.ntotRefMap; i++)
+        {
+          for (int j = 0; j < param.nTotGridAngles; j++)
+          {
+            bioem_Probability_angle &pProbAngle = pProb.getProbAngle(i, j);
+            pProbAngle.forAngles = tmp3[i * param.nTotGridAngles + j];
+            pProbAngle.ConstAngle = tmp2[i * param.nTotGridAngles + j];
+          }
+        }
+      }
+      delete[] tmp1;
+      delete[] tmp2;
+      delete[] tmp3;
+    }
+  }
+#endif
 
   // ************* Writing Out Probabilities ***************
   if (mpi_rank == 0)
+  {
+
+    // Output for Angle Probability File
+    ofstream angProbfile;
+    angProbfile.precision(OUTPUT_PRECISION);
+    angProbfile.setf(ios::fixed);
+    if (param.param_device.writeAngles)
+    {
+      angProbfile.open("ANG_PROB");
+      angProbfile << "************************* HEADER:: NOTATION "
+                     "*******************************************\n";
+      if (!param.doquater)
+      {
+        angProbfile << " RefMap:  MapNumber ; alpha[rad] - beta[rad] - "
+                       "gamma[rad] - logP - cal log Probability + Constant: "
+                       "Numerical Const.+ log (volume) + prior ang\n";
+      }
+      else
+      {
+        angProbfile << " RefMap:  MapNumber ; q1 - q2 -q3 - logP- cal log "
+                       "Probability + Constant: Numerical Const. + log "
+                       "(volume) + prior ang\n";
+      };
+      angProbfile << "************************* HEADER:: NOTATION "
+                     "*******************************************\n";
+      //          angProbfile <<"Model Used: " << modelfile.c_str() << "\n";
+      //          angProbfile <<"Input Used: " << infile.c_str() << "\n";
+    }
+
+    // Output for Standard Probability
+    ofstream outputProbFile;
+    outputProbFile.precision(OUTPUT_PRECISION);
+    outputProbFile.setf(ios::fixed);
+    outputProbFile.open(OutfileName.c_str());
+    outputProbFile << "************************* HEADER:: NOTATION "
+                      "*******************************************\n";
+    outputProbFile << "Notation= RefMap:  MapNumber ; LogProb natural "
+                      "logarithm of posterior Probability ; Constant: "
+                      "Numerical Const. for adding Probabilities \n";
+    if (!param.doquater)
+    {
+      if (param.usepsf)
+      {
+        outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: "
+                          "MaxLogProb - alpha[rad] - beta[rad] - gamma[rad] - "
+                          "PSF amp - PSF phase - PSF envelope - center x - "
+                          "center y - normalization - offsett \n";
+      }
+      else
+      {
+        outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: "
+                          "MaxLogProb - alpha[rad] - beta[rad] - gamma[rad] - "
+                          "CTF amp - CTF defocus - CTF B-Env - center x - "
+                          "center y - normalization - offsett \n";
+      }
+    }
+    else
     {
- 
-      // Output for Angle Probability File
-      ofstream angProbfile;
-      if(param.param_device.writeAngles)
-	{
-	  angProbfile.open ("ANG_PROB");
-	  angProbfile <<"************************* HEADER:: NOTATION *******************************************\n";
-          if(!param.doquater){ angProbfile <<" RefMap:  MapNumber ; alpha[rad] - beta[rad] - gamma[rad] - logP - cal log Probability + Constant: Numerical Const.+ log (volume) + prior ang\n" ;}
-	  else { angProbfile <<" RefMap:  MapNumber ; q1 - q2 -q3 - logP- cal log Probability + Constant: Numerical Const. + log (volume) + prior ang\n" ;};
-	  angProbfile <<"************************* HEADER:: NOTATION *******************************************\n";
-	  //          angProbfile <<"Model Used: " << modelfile.c_str() << "\n";
-	  //          angProbfile <<"Input Used: " << infile.c_str() << "\n";
-	}
-      // Output for Cross Correlation File
-      ofstream ccProbfile;
-      if(param.param_device.writeCC)
-	{
-	  ccProbfile.open ("CROSS_CORRELATION");
-	  ccProbfile <<"************************* HEADER:: NOTATION *******************************************\n";
-          ccProbfile <<" RefMap:  MapNumber ; Pixel x - Pixel y - Cross-Correlation \n";
-          ccProbfile <<"Note that the highest Cross-correlation is the best.\n";
-          ccProbfile <<"If the particles are flipped, include the keyward FLIPPED in the Param file.\n";
-          ccProbfile <<"************************* HEADER:: NOTATION *******************************************\n";
-	}
-
-      // Output for Standard Probability
-      ofstream outputProbFile;
-      if(!yesoutfilename)OutfileName="Output_Probabilities";
-      outputProbFile.open (OutfileName.c_str());
-      outputProbFile <<"************************* HEADER:: NOTATION *******************************************\n";   
-      outputProbFile << "Notation= RefMap:  MapNumber ; LogProb natural logarithm of posterior Probability ; Constant: Numerical Const. for adding Probabilities \n";
-      if(!param.doquater){
-	if(param.usepsf){
-	  outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: MaxLogProb - alpha[rad] - beta[rad] - gamma[rad] - PSF amp - PSF phase - PSF envelope - center x - center y - normalization - offsett \n";}else{
-	  outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: MaxLogProb - alpha[rad] - beta[rad] - gamma[rad] - CTF amp - CTF defocus - CTF B-Env - center x - center y - normalization - offsett \n";}
-      }else { 
-	if(param.usepsf){
-	  //     if( localcc[rx * param.param_device.NumberPixels + ry] <
-	  outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: MaxLogProb - q1 - q2 - q3 - q4 -PSF amp - PSF phase - PSF envelope - center x - center y - normalization - offsett \n";
-	}else{
-          outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: MaxLogProb - q1 - q2 - q3 - q4 - CTF amp - CTF defocus - CTF B-Env - center x - center y - normalization - offsett \n";
-        }}
-      if(param.writeCTF) outputProbFile << " RefMap:  MapNumber ; CTFMaxParm: defocus - b-Env (B ref. Penzeck 2010)\n";
-      if(param.yespriorAngles) outputProbFile << "**** Remark: Using Prior Proability in Angles ****\n";
-      outputProbFile <<"************************* HEADER:: NOTATION *******************************************\n\n";
-
-       
-      // Loop over reference maps
-      // ************* Over all maps ***************
-
-      for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap ++)
-	{
-	  // **** Total Probability ***
-	  bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
-
-	  //Controll for Value of Total Probability
-          // cout << pProbMap.Total << " " <<  pProbMap.Constoadd << " " << FLT_MAX <<" " << log(FLT_MAX) << "\n";
-          if(pProbMap.Total>1.e-38){
-
-	    outputProbFile << "RefMap: " << iRefMap << " LogProb:  "  << log(pProbMap.Total) + pProbMap.Constoadd + 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << " Constant: " << pProbMap.Constoadd  << "\n";
-	    outputProbFile << "RefMap: " << iRefMap << " Maximizing Param: ";
-            // *** Param that maximize probability****
-            outputProbFile << (log(pProbMap.Total) + pProbMap.Constoadd + 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5) * (log(2 * M_PI) + 1) + log(param.param_device.volu)) << " ";
-
-
-	  }else{ 
-	    outputProbFile << "Warining! with Map " << iRefMap << "Numerical Integrated Probability without constant = 0.0;\n";
-	    outputProbFile << "Warining RefMap: " << iRefMap << "Check that constant is finite: " << pProbMap.Constoadd  << "\n"; 
-	    outputProbFile << "Warining RefMap: i) check model, ii) check refmap , iii) check GPU on/off command inconsitency\n";
-	    //	    outputProbFile << "Warning! " << iRefMap << " LogProb:  "  << pProbMap.Constoadd + 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << " Constant: " << pProbMap.Constoadd  << "\n";
-	  }
-	  //	    outputProbFile << "RefMap: " << iRefMap << " Maximizing Param: ";
-
-	  // *** Param that maximize probability****
-	  //	    outputProbFile << (pProbMap.Constoadd + 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5) * (log(2 * M_PI) + 1) + log(param.param_device.volu)) << " ";
-
-	  outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[0] << " [] ";
-	  outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[1] << " [] ";
-	  outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[2] << " [] ";
-	  if(param.doquater)outputProbFile << param.angles[pProbMap.max.max_prob_orient].quat4 << " [] "; 
-	  outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[0] << " [] ";
-	  if(!param.usepsf){outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[1]/ 2.f /M_PI / param.elecwavel * 0.0001 << " [micro-m] ";
-	  }else{outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[1] << " [1/A²] ";}
-	  if(!param.usepsf){outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[2] << " [A²] ";}
-	  else{outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[2] << " [1/A²] ";}
-	  outputProbFile << pProbMap.max.max_prob_cent_x << " [pix] ";
-	  outputProbFile << pProbMap.max.max_prob_cent_y << " [pix] " ;
-	  if(FFTAlgo){outputProbFile << pProbMap.max.max_prob_norm << " [] " ;}else{outputProbFile << "N.A." << " [] ";}
-	  if(FFTAlgo){outputProbFile << pProbMap.max.max_prob_mu << " [] ";}else{outputProbFile << "N.A." << " [] ";} 
-	  outputProbFile << "\n";
-
-	  // Writing out CTF parameters if requiered
-	  if(param.writeCTF && param.usepsf){
-
-	    myfloat_t denomi;
-	    denomi = param.CtfParam[pProbMap.max.max_prob_conv].pos[1] * param.CtfParam[pProbMap.max.max_prob_conv].pos[1] + 
-	      param.CtfParam[pProbMap.max.max_prob_conv].pos[2] * param.CtfParam[pProbMap.max.max_prob_conv].pos[2];
-	    outputProbFile << "RefMap: " << iRefMap << " CTFMaxParam: ";
-	    outputProbFile <<  2*M_PI*param.CtfParam[pProbMap.max.max_prob_conv].pos[1]/denomi/param.elecwavel*0.0001 << " [micro-m] "; 
-	    outputProbFile << 4*M_PI*M_PI*param.CtfParam[pProbMap.max.max_prob_conv].pos[2]/denomi << " [A²] \n";
-	  }
-
-	  //*************** Writing Individual Angle probabilities
-	  if(param.param_device.writeAngles)
-	    {
-	      for (int iOrient = 0; iOrient < param.nTotGridAngles; iOrient++)
-		{
-		  bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
-
-		  myfloat_t logp=log(pProbAngle.forAngles)+ pProbAngle.ConstAngle+0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu);
-		  if(!param.doquater){
-		    // For Euler Angles
-		    if(param.yespriorAngles){
-		      logp+=param.angprior[iOrient];
-		      angProbfile << " " << iRefMap << " " << param.angles[iOrient].pos[0] << " " << param.angles[iOrient].pos[1] << " " << param.angles[iOrient].pos[2] << " " << logp << " Separated: "
-				  << log(pProbAngle.forAngles) << " " << pProbAngle.ConstAngle  << " " << 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << " " << param.angprior[iOrient] << "\n";
-		    } else
-		      {
-			angProbfile << " " << iRefMap << " " << param.angles[iOrient].pos[0] << " " << param.angles[iOrient].pos[1] << " " << param.angles[iOrient].pos[2] << " " <<  logp << " Separated: "<<
-			  log(pProbAngle.forAngles) << " " << pProbAngle.ConstAngle  << " " << 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << "\n";
-		      }
-		  }else {
-		    // Samething but for Quaternions
-		    if(param.yespriorAngles){
-		      logp+=param.angprior[iOrient];
-		      angProbfile << " " << iRefMap << " " << param.angles[iOrient].pos[0] << " " << param.angles[iOrient].pos[1] << " " << param.angles[iOrient].pos[2] << " " << param.angles[iOrient].quat4 << " " << logp << " Separated: " << log(pProbAngle.forAngles) << " " << pProbAngle.ConstAngle  << " " << 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << " " << param.angprior[iOrient] << "\n";
-		    } else
-		      {
-			angProbfile << " " << iRefMap << " " << param.angles[iOrient].pos[0] << " " << param.angles[iOrient].pos[1] << " " << param.angles[iOrient].pos[2] << " " << param.angles[iOrient].quat4 << " " << logp << " Separated: "<<
-			  log(pProbAngle.forAngles) << " " << pProbAngle.ConstAngle  << " " << 0.5 * log(M_PI) + (1 - param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) + log(param.param_device.volu) << "\n";
-		      }
-		  }
-		}
-	    }
-	
-	  //************* Writing Cross-Correlations if requiered
-          //************* This is currently not in the manual *****
-	  if(param.param_device.writeCC){
-
-	    int  cc=0;
-	    int halfPix;
-	    int rx=0;
-	    int ry=0;
-	    myfloat_t localcc[ (param.param_device.NumberPixels+1) * (param.param_device.NumberPixels+1) ];
-            int used[(param.param_device.NumberPixels+1) * (param.param_device.NumberPixels+1)];
-
-	    halfPix = param.param_device.NumberPixels / 2 ;
-	    // Ordering the centers of the Cross Correlation
-
-	    for (int rx = 0; rx < param.param_device.NumberPixels ; rx++)
-	      {
-		for (int ry = 0; ry < param.param_device.NumberPixels ; ry++)
-		  {
-		    localcc[ rx * param.param_device.NumberPixels + ry ] = 0.0;
-			used[ rx * param.param_device.NumberPixels + ry ]= 0;
-		  }
-	      }
-
-	    for (int cent_x = 0; cent_x < param.param_device.NumberPixels ; cent_x = cent_x + param.param_device.CCdisplace)
-	      {
-		for (int cent_y = 0; cent_y < param.param_device.NumberPixels ; cent_y = cent_y + param.param_device.CCdisplace)
-		  {
-		    //localcc[ rx * param.param_device.NumberPixels + ry ] = 0.0;
-		    bioem_Probability_cc& pProbCC = pProb.getProbCC(iRefMap, cc);
-
-		    // Applying Periodic boundary conditions to the CC
-		    if(cent_x < halfPix && cent_y < halfPix){
-		      //	ccProbfile << " " << iRefMap << " " << (myfloat_t) halfPix  - cent_x << " " << halfPix - cent_y << " " << pProbCC.forCC <<"\n";
-		      rx = halfPix  - cent_x;
-		      ry = halfPix  - cent_y;}
-		    if(cent_x >= halfPix && cent_y < halfPix){
-		      //      ccProbfile << " " << iRefMap << " " << (myfloat_t) 3 * halfPix  - cent_x << " " << halfPix - cent_y << " " << pProbCC.forCC <<"\n"; 
-		      rx = 3 * halfPix  - cent_x;
-		      ry = halfPix  - cent_y;}
-		    if(cent_x < halfPix && cent_y >= halfPix){
-		      //      ccProbfile << " " << iRefMap << " " << (myfloat_t) halfPix  - cent_x << " " << 3 * halfPix - cent_y << " " << pProbCC.forCC <<"\n";
-		      rx = halfPix  - cent_x;
-		      ry = 3 * halfPix  - cent_y;}
-		    if(cent_x >= halfPix && cent_y >= halfPix){
-		      //        ccProbfile << " " << iRefMap << " " << 3* halfPix  - cent_x << " " << 3 * halfPix - cent_y << " " << pProbCC.forCC <<"\n";
-		      rx = 3 * halfPix  - cent_x;
-		      ry = 3 * halfPix  - cent_y;}
-		    //						cout << " TT " << cent_x << " " << rx << " " << cent_y << " " << ry << " " <<  pProbCC.forCC << "\n";
-		    if(!param.param_device.CCwithBayes){
-		      localcc[ rx * param.param_device.NumberPixels + ry ] = pProbCC.forCC;
-		    }else{ 
-		      localcc[ rx * param.param_device.NumberPixels + ry ] = log(pProbCC.forCC)+pProbCC.ConstCC;
-		    }
- 			used[ rx * param.param_device.NumberPixels + ry] = 1;
-		    cc++;
-		  }
-		//              ccProbfile << "\n";
-	      }
-	    if(!param.ignoreCCoff){
-/*	      for (int rx = param.param_device.CCdisplace; rx < param.param_device.NumberPixels ; rx = rx + param.param_device.CCdisplace)
-		{
-		  for (int ry = param.param_device.CCdisplace; ry < param.param_device.NumberPixels ; ry = ry + param.param_device.CCdisplace)
-		    {*/
-  for (int rx = param.param_device.CCdisplace; rx < param.param_device.NumberPixels ; rx++)
-                {
-                  for (int ry = param.param_device.CCdisplace; ry < param.param_device.NumberPixels ; ry++)
-                    {
-
-		      if(used[ rx * param.param_device.NumberPixels + ry ] == 1){
-			ccProbfile << "RefMap: "<< iRefMap << " " << rx << " " << ry << " " << localcc[ rx * param.param_device.NumberPixels + ry ] << "\n" ;
-		      }else{
-			if(localcc[ rx * param.param_device.NumberPixels + ry ] <= -FLT_MAX)ccProbfile << "RefMap: "<< iRefMap << " " << rx << " " << ry << " " << -FLT_MAX << "\n" ;
-		      }
-		      //				 cout << " cc " << rx << " " << ry << " " << localcc[ rx * param.param_device.NumberPixels + ry ] <<"\n" ;
-		    }
-	//	  ccProbfile << "\n";
-		}			
-	    }else{
-	      for (int rx = param.param_device.CCdisplace; rx < param.param_device.NumberPixels ; rx++)
-		{
-		  for (int ry = param.param_device.CCdisplace; ry < param.param_device.NumberPixels ; ry++)
-		    {
-                         if(used[ rx * param.param_device.NumberPixels + ry ] == 1){
-                        ccProbfile << "RefMap: "<< iRefMap << " " << rx << " " << ry << " " << localcc[ rx * param.param_device.NumberPixels + ry ] << "\n" ;
-		      }else{
-                        if(localcc[ rx * param.param_device.NumberPixels + ry ] <= -FLT_MAX)ccProbfile << "RefMap: "<< iRefMap << " " << rx << " " << ry << " " << -FLT_MAX << "\n" ;
-		      }
-		    }
-	//	  ccProbfile << "\n";
-		}
-
-	    }
-	  }
-	}
-
-      if(param.param_device.writeAngles)
-	{
-	  angProbfile.close();
-	}
-
-      if(param.param_device.writeCC)
-	{
-	  ccProbfile.close();
-	}
-
-      outputProbFile.close();
+      if (param.usepsf)
+      {
+        //     if( localcc[rx * param.param_device.NumberPixels + ry] <
+        outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: "
+                          "MaxLogProb - q1 - q2 - q3 - q4 -PSF amp - PSF phase "
+                          "- PSF envelope - center x - center y - "
+                          "normalization - offsett \n";
+      }
+      else
+      {
+        outputProbFile << "Notation= RefMap:  MapNumber ; Maximizing Param: "
+                          "MaxLogProb - q1 - q2 - q3 - q4 - CTF amp - CTF "
+                          "defocus - CTF B-Env - center x - center y - "
+                          "normalization - offsett \n";
+      }
     }
+    if (param.writeCTF)
+      outputProbFile << " RefMap:  MapNumber ; CTFMaxParm: defocus - b-Env (B "
+                        "ref. Penzeck 2010)\n";
+    if (param.yespriorAngles)
+      outputProbFile << "**** Remark: Using Prior Proability in Angles ****\n";
+    outputProbFile << "************************* HEADER:: NOTATION "
+                      "*******************************************\n\n";
+
+    // Loop over reference maps
+    // ************* Over all maps ***************
+
+    for (int iRefMap = 0; iRefMap < RefMap.ntotRefMap; iRefMap++)
+    {
+      // **** Total Probability ***
+      bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
 
-  return(0);
+      // Control for Value of Total Probability
+      // cout << pProbMap.Total << " " <<  pProbMap.Constoadd << " " << FLT_MAX
+      // <<" " << log(FLT_MAX) << "\n";
+      if (pProbMap.Total > 1.e-38)
+      {
+
+        outputProbFile << "RefMap: " << iRefMap << " LogProb:  "
+                       << log(pProbMap.Total) + pProbMap.Constoadd +
+                              0.5 * log(M_PI) +
+                              (1 - param.param_device.Ntotpi * 0.5) *
+                                  (log(2 * M_PI) + 1) +
+                              log(param.param_device.volu)
+                       << " Constant: " << pProbMap.Constoadd << "\n";
+        outputProbFile << "RefMap: " << iRefMap << " Maximizing Param: ";
+        // *** Param that maximize probability****
+        outputProbFile << (log(pProbMap.Total) + pProbMap.Constoadd +
+                           0.5 * log(M_PI) +
+                           (1 - param.param_device.Ntotpi * 0.5) *
+                               (log(2 * M_PI) + 1) +
+                           log(param.param_device.volu))
+                       << " ";
+      }
+      else
+      {
+        outputProbFile
+            << "Warining! with Map " << iRefMap
+            << "Numerical Integrated Probability without constant = 0.0;\n";
+        outputProbFile << "Warining RefMap: " << iRefMap
+                       << "Check that constant is finite: "
+                       << pProbMap.Constoadd << "\n";
+        outputProbFile << "Warining RefMap: i) check model, ii) check refmap , "
+                          "iii) check GPU on/off command inconsitency\n";
+        //	    outputProbFile << "Warning! " << iRefMap << " LogProb:  "
+        //<< pProbMap.Constoadd + 0.5 * log(M_PI) + (1 -
+        // param.param_device.Ntotpi * 0.5)*(log(2 * M_PI) + 1) +
+        // log(param.param_device.volu) << " Constant: " << pProbMap.Constoadd
+        //<< "\n";
+      }
+      //	    outputProbFile << "RefMap: " << iRefMap << " Maximizing
+      // Param: ";
+
+      // *** Param that maximize probability****
+      //	    outputProbFile << (pProbMap.Constoadd + 0.5 * log(M_PI) + (1
+      //- param.param_device.Ntotpi * 0.5) * (log(2 * M_PI) + 1) +
+      // log(param.param_device.volu)) << " ";
+
+      outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[0]
+                     << " [] ";
+      outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[1]
+                     << " [] ";
+      outputProbFile << param.angles[pProbMap.max.max_prob_orient].pos[2]
+                     << " [] ";
+      if (param.doquater)
+        outputProbFile << param.angles[pProbMap.max.max_prob_orient].quat4
+                       << " [] ";
+      outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[0]
+                     << " [] ";
+      if (!param.usepsf)
+      {
+        outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[1] /
+                              2.f / M_PI / param.elecwavel * 0.0001
+                       << " [micro-m] ";
+      }
+      else
+      {
+        outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[1]
+                       << " [1/A²] ";
+      }
+      if (!param.usepsf)
+      {
+        outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[2]
+                       << " [A²] ";
+      }
+      else
+      {
+        outputProbFile << param.CtfParam[pProbMap.max.max_prob_conv].pos[2]
+                       << " [1/A²] ";
+      }
+      outputProbFile << pProbMap.max.max_prob_cent_x << " [pix] ";
+      outputProbFile << pProbMap.max.max_prob_cent_y << " [pix] ";
+      outputProbFile << pProbMap.max.max_prob_norm << " [] ";
+      outputProbFile << pProbMap.max.max_prob_mu << " [] ";
+      outputProbFile << "\n";
+
+      // Writing out CTF parameters if required
+      if (param.writeCTF && param.usepsf)
+      {
+
+        myfloat_t denomi;
+        denomi = param.CtfParam[pProbMap.max.max_prob_conv].pos[1] *
+                     param.CtfParam[pProbMap.max.max_prob_conv].pos[1] +
+                 param.CtfParam[pProbMap.max.max_prob_conv].pos[2] *
+                     param.CtfParam[pProbMap.max.max_prob_conv].pos[2];
+        outputProbFile << "RefMap: " << iRefMap << " CTFMaxParam: ";
+        outputProbFile
+            << 2 * M_PI * param.CtfParam[pProbMap.max.max_prob_conv].pos[1] /
+                   denomi / param.elecwavel * 0.0001
+            << " [micro-m] ";
+        outputProbFile
+            << 4 * M_PI * M_PI *
+                   param.CtfParam[pProbMap.max.max_prob_conv].pos[2] / denomi
+            << " [A²] \n";
+      }
+
+      //*************** Writing Individual Angle probabilities
+      if (param.param_device.writeAngles)
+      {
+        // Keep only the K = param.param_device.writeAngles orientations with
+        // the highest logp, using a min-heap that holds at most K entries.
+        // This implementation is simple rather than optimal, and it assumes
+        // param.param_device.writeAngles << param.nTotGridAngles
+        unsigned K =
+            param.param_device.writeAngles; // number of best probabilities
+                                            // clang-format off
+        std::priority_queue<std::pair<double, int>,
+                            std::vector<std::pair<double, int> >,
+                            std::greater<std::pair<double, int> > >
+            q;
+        // clang-format on
+        for (int iOrient = 0; iOrient < param.nTotGridAngles; iOrient++)
+        {
+          bioem_Probability_angle &pProbAngle =
+              pProb.getProbAngle(iRefMap, iOrient);
+
+          myprob_t logp =
+              log(pProbAngle.forAngles) + pProbAngle.ConstAngle +
+              0.5 * log(M_PI) +
+              (1 - param.param_device.Ntotpi * 0.5) * (log(2 * M_PI) + 1) +
+              log(param.param_device.volu);
+
+          if (q.size() < K)
+            q.push(std::pair<double, int>(logp, iOrient));
+          else if (q.top().first < logp)
+          {
+            q.pop();
+            q.push(std::pair<double, int>(logp, iOrient));
+          }
+        }
+        K = q.size();
+        int *rev_iOrient = (int *) malloc(K * sizeof(int));
+        myprob_t *rev_logp = (myprob_t *) malloc(K * sizeof(myprob_t));
+        for (int i = K - 1; i >= 0; i--)
+        {
+          rev_iOrient[i] = q.top().second;
+          rev_logp[i] = q.top().first;
+          q.pop();
+        }
+        for (unsigned i = 0; i < K; i++)
+        {
+          int iOrient = rev_iOrient[i];
+          bioem_Probability_angle &pProbAngle =
+              pProb.getProbAngle(iRefMap, iOrient);
+          myprob_t logp = rev_logp[i];
+
+          if (!param.doquater)
+          {
+            // For Euler Angles
+            if (param.yespriorAngles)
+            {
+              logp += param.angprior[iOrient];
+              angProbfile << " " << iRefMap << " "
+                          << param.angles[iOrient].pos[0] << " "
+                          << param.angles[iOrient].pos[1] << " "
+                          << param.angles[iOrient].pos[2] << " " << logp
+                          << " Separated: " << log(pProbAngle.forAngles) << " "
+                          << pProbAngle.ConstAngle << " "
+                          << 0.5 * log(M_PI) +
+                                 (1 - param.param_device.Ntotpi * 0.5) *
+                                     (log(2 * M_PI) + 1) +
+                                 log(param.param_device.volu)
+                          << " " << param.angprior[iOrient] << "\n";
+            }
+            else
+            {
+              angProbfile << " " << iRefMap << " "
+                          << param.angles[iOrient].pos[0] << " "
+                          << param.angles[iOrient].pos[1] << " "
+                          << param.angles[iOrient].pos[2] << " " << logp
+                          << " Separated: " << log(pProbAngle.forAngles) << " "
+                          << pProbAngle.ConstAngle << " "
+                          << 0.5 * log(M_PI) +
+                                 (1 - param.param_device.Ntotpi * 0.5) *
+                                     (log(2 * M_PI) + 1) +
+                                 log(param.param_device.volu)
+                          << "\n";
+            }
+          }
+          else
+          {
+            // Same thing but for Quaternions
+            if (param.yespriorAngles)
+            {
+              logp += param.angprior[iOrient];
+              angProbfile << " " << iRefMap << " "
+                          << param.angles[iOrient].pos[0] << " "
+                          << param.angles[iOrient].pos[1] << " "
+                          << param.angles[iOrient].pos[2] << " "
+                          << param.angles[iOrient].quat4 << " " << logp
+                          << " Separated: " << log(pProbAngle.forAngles) << " "
+                          << pProbAngle.ConstAngle << " "
+                          << 0.5 * log(M_PI) +
+                                 (1 - param.param_device.Ntotpi * 0.5) *
+                                     (log(2 * M_PI) + 1) +
+                                 log(param.param_device.volu)
+                          << " " << param.angprior[iOrient] << "\n";
+            }
+            else
+            {
+              angProbfile << " " << iRefMap << " "
+                          << param.angles[iOrient].pos[0] << " "
+                          << param.angles[iOrient].pos[1] << " "
+                          << param.angles[iOrient].pos[2] << " "
+                          << param.angles[iOrient].quat4 << " " << logp
+                          << " Separated: " << log(pProbAngle.forAngles) << " "
+                          << pProbAngle.ConstAngle << " "
+                          << 0.5 * log(M_PI) +
+                                 (1 - param.param_device.Ntotpi * 0.5) *
+                                     (log(2 * M_PI) + 1) +
+                                 log(param.param_device.volu)
+                          << "\n";
+            }
+          }
+        }
+        free(rev_iOrient);
+        free(rev_logp);
+      }
+    }
+
+    if (param.param_device.writeAngles)
+    {
+      angProbfile.close();
+    }
+
+    outputProbFile.close();
+  }
+
+  return (0);
 }
 
-int bioem::compareRefMaps(int iOrient, int iConv,  myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
+int bioem::compareRefMaps(int iPipeline, int iOrient, int iConvStart,
+                          int maxParallelConv, mycomplex_t *localmultFFT,
+                          myparam5_t *comp_params, const int startMap)
 {
-
   //***************************************************************************************
   //***** BioEM routine for comparing reference maps to convoluted maps *****
   //***************************************************************************************
-  cuda_custom_timeslot("Comparison", iOrient, iConv, COLOR_COMPARISON);
-  if (FFTAlgo)
-    {
-      //With FFT Algorithm
+  cuda_custom_timeslot("Comparison", iOrient, iConvStart, COLOR_COMPARISON);
+  // k: base slot in localmultFFT / comp_params (0 for even iPipeline, nTotParallelConv for odd)
+  int k = (iPipeline & 1) * param.nTotParallelConv;
+  // BioEMAlgo == 1: OpenMP over reference maps, each thread loops over the convolutions
+  if (BioEMAlgo == 1)
+  {
 #pragma omp parallel for schedule(dynamic, 1)
-      for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
-	{
-	  const int num = omp_get_thread_num();
-	  calculateCCFFT(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, localmultFFT, param.fft_scratch_complex[num], param.fft_scratch_real[num]);
-	}
+    for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap++)
+    {
+      const int num = omp_get_thread_num();
+      for (int iConv = 0; iConv < maxParallelConv; iConv++)
+      {
+        calculateCCFFT(iRefMap, &localmultFFT[(k + iConv) * param.FFTMapSize],
+                       param.fft_scratch_complex[num],
+                       param.fft_scratch_real[num]);
+        doRefMapFFT(
+            iRefMap, iOrient, iConvStart + iConv, comp_params[k + iConv].amp,
+            comp_params[k + iConv].pha, comp_params[k + iConv].env,
+            comp_params[k + iConv].sumC, comp_params[k + iConv].sumsquareC,
+            param.fft_scratch_real[num], pProb, param.param_device, RefMap);
+      }
     }
+  }
   else
+  { // Otherwise: reference maps in sequence, OpenMP over the convolutions, then a serial reduce
+    myblockCPU_t *comp_blocks = new myblockCPU_t[maxParallelConv]; // one partial result per convolution
+    for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap++)
     {
-      //Without FFT Algorithm
 #pragma omp parallel for schedule(dynamic, 1)
-      for (int iRefMap = startMap; iRefMap < RefMap.ntotRefMap; iRefMap ++)
-	{
-	  compareRefMapShifted < -1 > (iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, conv_map, pProb, param.param_device, RefMap);
-	}
+      for (int iConv = 0; iConv < maxParallelConv; iConv++)
+      {
+        const int num = omp_get_thread_num();
+        calculateCCFFT(iRefMap, &localmultFFT[(k + iConv) * param.FFTMapSize],
+                       param.fft_scratch_complex[num],
+                       param.fft_scratch_real[num]);
+        doRefMap_CPU_Parallel(iRefMap, iOrient, iConv,
+                              param.fft_scratch_real[num], &comp_params[k],
+                              comp_blocks);
+      }
+      doRefMap_CPU_Reduce(iRefMap, iOrient, iConvStart, maxParallelConv,
+                          &comp_params[k], comp_blocks);
     }
+    delete[] comp_blocks;
+  }
 
   cuda_custom_timeslot_end;
-  return(0);
+  return (0);
 }
 
-inline void bioem::calculateCCFFT(int iRefMap, int iOrient, int iConv,  myfloat_t amp, myfloat_t pha, myfloat_t env, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT, mycomplex_t* localCCT, myfloat_t* lCC)
+inline void bioem::calculateCCFFT(int iRefMap, mycomplex_t *localConvFFT,
+                                  mycomplex_t *localCCT, myfloat_t *lCC)
 {
   //***************************************************************************************
-  //***** Calculating cross correlation in FFTALGOrithm *****
+  //***** Cross correlation with FFT algorithm: lCC = c2r(localConvFFT * conj(RefMapFFT)) *****
 
-  for(int i = 0; i < param.param_device.NumberPixels; i++)
-    {
-      for(int j = 0; j < param.param_device.NumberPixels; j++) lCC[i * param.param_device.NumberPixels + j] = 0.f; 
-    }
+  for (int i = 0; i < param.param_device.NumberPixels; i++)
+  {
+    for (int j = 0; j < param.param_device.NumberPixels; j++)
+      lCC[i * param.param_device.NumberPixels + j] = 0.f;
+  }
+  // Multiply the convoluted-map FFT by the complex conjugate of reference map iRefMap's FFT
+  const mycomplex_t *RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.FFTMapSize];
+  for (int i = 0; i < param.param_device.NumberPixels *
+                          param.param_device.NumberFFTPixels1D;
+       i++)
+  {
+    localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] +
+                     localConvFFT[i][1] * RefMapFFT[i][1];
+    localCCT[i][1] = localConvFFT[i][1] * RefMapFFT[i][0] -
+                     localConvFFT[i][0] * RefMapFFT[i][1];
+  }
+  // Back-transform the product into lCC; no scaling is applied in this function
+  myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, localCCT, lCC);
+}
 
+inline void bioem::doRefMap_CPU_Parallel(int iRefMap, int iOrient, int iConv,
+                                         myfloat_t *lCC,
+                                         myparam5_t *comp_params,
+                                         myblockCPU_t *comp_block)
+{
+  //***************************************************************************************
+  //***** Log probabilities of one (iRefMap, iConv) pair over all center displacements; result goes to comp_block[iConv]
 
-  const mycomplex_t* RefMapFFT = &RefMap.RefMapsFFT[iRefMap * param.FFTMapSize];
-  for(int i = 0; i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D; i++)
+  int myGlobalId = iConv * param.param_device.NtotDisp;
+  myfloat_t bestLogpro = MIN_PROB;
+  int dispC =
+      param.param_device.NumberPixels - param.param_device.maxDisplaceCenter;
+  int cent_x, cent_y, address, bestId = 0;
+  myfloat_t value, bestValue = 0.;
+  myprob_t logpro = 0., sumExp = 0.;
+  // Scan the NxDisp x NxDisp grid of displacements; pixel indices wrap modulo NumberPixels
+  for (int myX = 0; myX < param.param_device.NxDisp; myX++)
+  {
+    for (int myY = 0; myY < param.param_device.NxDisp; myY++, myGlobalId++)
     {
-      localCCT[i][0] = localConvFFT[i][0] * RefMapFFT[i][0] + localConvFFT[i][1] * RefMapFFT[i][1];
-      localCCT[i][1] = localConvFFT[i][1] * RefMapFFT[i][0] - localConvFFT[i][0] * RefMapFFT[i][1];
+      cent_x = (myX * param.param_device.GridSpaceCenter + dispC) %
+               param.param_device.NumberPixels;
+      cent_y = (myY * param.param_device.GridSpaceCenter + dispC) %
+               param.param_device.NumberPixels;
+      address = cent_x * param.param_device.NumberPixels + cent_y;
+      value = (myfloat_t) lCC[address] /
+              (myfloat_t)(param.param_device.NumberPixels *
+                          param.param_device.NumberPixels);
+      // Log probability of this displacement from the scaled cross-correlation value
+      logpro = calc_logpro(
+          param.param_device, comp_params[iConv].amp, comp_params[iConv].pha,
+          comp_params[iConv].env, comp_params[iConv].sumC,
+          comp_params[iConv].sumsquareC, value, RefMap.sum_RefMap[iRefMap],
+          RefMap.sumsquare_RefMap[iRefMap]);
+#ifdef DEBUG_PROB
+      printf("\t\t\tProb: iRefMap %d, iOrient %d, iConv %d, "
+             "cent_x %d, cent_y %d, address %d, value %f, logpro %f\n",
+             iRefMap, iOrient, iConv, cent_x, cent_y, address, value, logpro);
+#endif
+      if (bestLogpro < logpro) // new maximum: rescale the running sum relative to it
+      {
+        sumExp *= exp(-logpro + bestLogpro);
+        bestLogpro = logpro;
+        bestId = myGlobalId;
+        bestValue = value;
+      }
+      sumExp += exp(logpro - bestLogpro);
     }
+  }
 
-  myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, localCCT, lCC);
- 
-  // printf("HereCC %p %f %d %d %d %d \n", &lCC[139 * param.param_device.NumberPixels + 139],lCC[139 * param.param_device.NumberPixels + 139],mpi_rank,iConv,iOrient,iRefMap);
+  comp_block[iConv].logpro = bestLogpro; // best log probability found for this convolution
+  comp_block[iConv].sumExp = sumExp; // sum of exp(logpro - bestLogpro) over the grid
+  comp_block[iConv].id = bestId; // encodes iConv * NtotDisp + displacement index
+  comp_block[iConv].value = bestValue;
+}
 
-  doRefMapFFT(iRefMap, iOrient, iConv, amp, pha, env, lCC, sumC, sumsquareC, pProb, param.param_device, RefMap);
+inline void bioem::doRefMap_CPU_Reduce(int iRefMap, int iOrient, int iConvStart,
+                                       int maxParallelConv,
+                                       myparam5_t *comp_params,
+                                       myblockCPU_t *comp_block)
+{
+  //***************************************************************************************
+  //***** Reduction of previously computed log probabilities (comp_block[0..maxParallelConv-1])
+  // Running (Constoadd, Total) pair and best parameters of reference map iRefMap, updated in place
+  bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
 
-#ifdef DEBUG
-  if (param.param_device.writeCC)
-    {      int  cc=0;
-      for (int cent_x = 0; cent_x < param.param_device.NumberPixels ; cent_x = cent_x + param.param_device.CCdisplace)
-	{
-	  for (int cent_y = 0; cent_y < param.param_device.NumberPixels ; cent_y = cent_y + param.param_device.CCdisplace)
-	    {
-	      cout << "CHECKCC " << " " << cent_x << " " << cent_y <<" " << lCC[cent_x * param.param_device.NumberPixels + cent_y] / (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels ) << "\n";
-	      cc++;
-	    }
-	}
+  for (int i = 0; i < maxParallelConv; i++)
+  {
+    if (pProbMap.Constoadd < comp_block[i].logpro) // block beats the current maximum
+    {
+      pProbMap.Total *= exp(-comp_block[i].logpro + pProbMap.Constoadd);
+      pProbMap.Constoadd = comp_block[i].logpro;
+
+      // ********** Getting parameters that maximize the probability ***********
+      int myGlobalId = comp_block[i].id;
+      int myConv = myGlobalId / param.param_device.NtotDisp;
+      myGlobalId -= myConv * param.param_device.NtotDisp;
+      int myX = myGlobalId / param.param_device.NxDisp;
+      myGlobalId -= myX * param.param_device.NxDisp;
+      int myY = myGlobalId;
+      // Grid indices back to displacements: maxDisplaceCenter - index * GridSpaceCenter
+      int dispC = param.param_device.NumberPixels -
+                  param.param_device.maxDisplaceCenter;
+      myfloat_t value = comp_block[i].value;
+
+      pProbMap.max.max_prob_cent_x =
+          -((myX * param.param_device.GridSpaceCenter + dispC) -
+            param.param_device.NumberPixels);
+      pProbMap.max.max_prob_cent_y =
+          -((myY * param.param_device.GridSpaceCenter + dispC) -
+            param.param_device.NumberPixels);
+      pProbMap.max.max_prob_orient = iOrient;
+      pProbMap.max.max_prob_conv = iConvStart + myConv;
+      // Norm and mu at the maximum, from the map sums and the stored cross-correlation value
+      pProbMap.max.max_prob_norm =
+          -(-comp_params[myConv].sumC * RefMap.sum_RefMap[iRefMap] +
+            param.param_device.Ntotpi * value) /
+          (comp_params[myConv].sumC * comp_params[myConv].sumC -
+           comp_params[myConv].sumsquareC * param.param_device.Ntotpi);
+      pProbMap.max.max_prob_mu =
+          -(-comp_params[myConv].sumC * value +
+            comp_params[myConv].sumsquareC * RefMap.sum_RefMap[iRefMap]) /
+          (comp_params[myConv].sumC * comp_params[myConv].sumC -
+           comp_params[myConv].sumsquareC * param.param_device.Ntotpi);
+
+#ifdef DEBUG_PROB
+      printf("\tProbabilities change: iRefMap %d, iOrient %d, iConv %d, "
+             "Total %f, Const %f, bestlogpro %f, sumExp %f, bestId %d\n",
+             iRefMap, iOrient, iConvStart + myConv, pProbMap.Total,
+             pProbMap.Constoadd, comp_block[i].logpro, comp_block[i].sumExp,
+             comp_block[i].id);
+      printf("\tParameters: myConv %d, myX %d, myY %d, cent_x -, cent_y -, "
+             "probX %d, probY %d\n",
+             myConv, myX, myY, pProbMap.max.max_prob_cent_x,
+             pProbMap.max.max_prob_cent_y);
+#endif
     }
+    pProbMap.Total += // add this block's sum, rescaled to the current Constoadd
+        comp_block[i].sumExp * exp(comp_block[i].logpro - pProbMap.Constoadd);
+#ifdef DEBUG_PROB
+    printf("\t\tProbabilities after Reduce: iRefMap %d, iOrient %d, iConv "
+           "%d, Total %f, Const %f, bestlogpro %f, sumExp %f, bestId %d\n",
+           iRefMap, iOrient, iConvStart, pProbMap.Total, pProbMap.Constoadd,
+           comp_block[i].logpro, comp_block[i].sumExp, comp_block[i].id);
 #endif
 
+    // Code for writing angles, not used by default (same merge, per orientation)
+    if (param.param_device.writeAngles)
+    {
+      bioem_Probability_angle &pProbAngle =
+          pProb.getProbAngle(iRefMap, iOrient);
+      if (pProbAngle.ConstAngle < comp_block[i].logpro)
+      {
+        pProbAngle.forAngles *=
+            exp(-comp_block[i].logpro + pProbAngle.ConstAngle);
+        pProbAngle.ConstAngle = comp_block[i].logpro;
+      }
+      pProbAngle.forAngles += comp_block[i].sumExp *
+                              exp(comp_block[i].logpro - pProbAngle.ConstAngle);
+    }
+  }
 }
 
-int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
+int bioem::createProjection(int iMap, mycomplex_t *mapFFT)
 {
   // **************************************************************************************
-  // ****  BioEM Create Projection routine in Euler angles / Quaternions ******************
-  // ********************* and turns projection into Fourier space ************************
+  // ****  BioEM Create Projection routine: rotates the model (Euler angles
+  // ****  or quaternions, selected by param.doquater), projects it onto the
+  // ****  NumberPixels x NumberPixels grid, and transforms the projection
+  // ****  into Fourier space (result written to mapFFT)
   // **************************************************************************************
 
   cuda_custom_timeslot("Projection", iMap, 0, COLOR_PROJECTION);
@@ -1140,35 +1560,38 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
   myfloat3_t RotatedPointsModel[Model.nPointsModel];
   myfloat_t rotmat[3][3];
   myfloat_t alpha, gam, beta;
-  myfloat_t* localproj;
+  myfloat_t *localproj;
 
   localproj = param.fft_scratch_real[omp_get_thread_num()];
-  memset(localproj, 0, param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(*localproj));
+  memset(localproj, 0, param.param_device.NumberPixels *
+                           param.param_device.NumberPixels *
+                           sizeof(*localproj));
 
   //*************** Rotating the model ****************************
   //*************** Quaternions ****************************
-  if(param.doquater){
+  if (param.doquater)
+  {
 
     myfloat_t quater[4];
-    //quaternion
-    quater[0]=param.angles[iMap].pos[0];
-    quater[1]=param.angles[iMap].pos[1];
-    quater[2]=param.angles[iMap].pos[2];  
-    quater[3]=param.angles[iMap].quat4;
-
-    //Rotation Matrix for Quaterions (wikipeda)
-    rotmat[0][0] = 1- 2 * quater[1] * quater[1]  - 2 * quater[2] * quater[2];
-    rotmat[1][0] = 2 * ( quater[0] *  quater[1] -  quater[2] *  quater[3]);
-    rotmat[2][0] =  2 * ( quater[0] *  quater[2] +  quater[1] *  quater[3]);
-    rotmat[0][1] = 2 * ( quater[0] *  quater[1] +  quater[2] *  quater[3]);
-    rotmat[1][1] = 1- 2 * quater[0] * quater[0]  - 2 * quater[2] * quater[2];
-    rotmat[2][1] =  2 * ( quater[1] *  quater[2] -  quater[0] *  quater[3]);
-    rotmat[0][2] = 2 * ( quater[0] *  quater[2] -  quater[1] *  quater[3]);
-    rotmat[1][2] = 2 * ( quater[1] *  quater[2] +  quater[0] *  quater[3]);
-    rotmat[2][2] = 1- 2 * quater[0] * quater[0]  - 2 * quater[1] * quater[1];
- 
-
-  } else{
+    // quaternion
+    quater[0] = param.angles[iMap].pos[0];
+    quater[1] = param.angles[iMap].pos[1];
+    quater[2] = param.angles[iMap].pos[2];
+    quater[3] = param.angles[iMap].quat4;
+
+    // Rotation matrix for quaternions (Wikipedia)
+    rotmat[0][0] = 1 - 2 * quater[1] * quater[1] - 2 * quater[2] * quater[2];
+    rotmat[1][0] = 2 * (quater[0] * quater[1] - quater[2] * quater[3]);
+    rotmat[2][0] = 2 * (quater[0] * quater[2] + quater[1] * quater[3]);
+    rotmat[0][1] = 2 * (quater[0] * quater[1] + quater[2] * quater[3]);
+    rotmat[1][1] = 1 - 2 * quater[0] * quater[0] - 2 * quater[2] * quater[2];
+    rotmat[2][1] = 2 * (quater[1] * quater[2] - quater[0] * quater[3]);
+    rotmat[0][2] = 2 * (quater[0] * quater[2] - quater[1] * quater[3]);
+    rotmat[1][2] = 2 * (quater[1] * quater[2] + quater[0] * quater[3]);
+    rotmat[2][2] = 1 - 2 * quater[0] * quater[0] - 2 * quater[1] * quater[1];
+  }
+  else
+  {
 
     //*************** Euler Angles****************************
     // Doing Euler angles instead of Quaternions
@@ -1176,9 +1599,10 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
     beta = param.angles[iMap].pos[1];
     gam = param.angles[iMap].pos[2];
 
-    //*** To see how things are going:
-#ifdef DEBUG 
-    cout << "Id " << omp_get_thread_num() <<  " Angs: " << alpha << " " << beta << " " << gam << "\n"; 
+//*** To see how things are going:
+#ifdef DEBUG
+    cout << "Id " << omp_get_thread_num() << " Angs: " << alpha << " " << beta
+         << " " << gam << "\n";
 #endif
     // ********** Creat Rotation with pre-defiend grid of orientations**********
     // Same notation as in Goldstein and Mathematica
@@ -1191,308 +1615,446 @@ int bioem::createProjection(int iMap, mycomplex_t* mapFFT)
     rotmat[2][0] = sin(beta) * sin(alpha);
     rotmat[2][1] = -sin(beta) * cos(alpha);
     rotmat[2][2] = cos(beta);
+  }
 
-}
-  
-// The rotation matrix is calculated either for the quaternions or for the euler angles
-  for(int n = 0; n < Model.nPointsModel; n++)
-      {
-	RotatedPointsModel[n].pos[0] = 0.0;
-	RotatedPointsModel[n].pos[1] = 0.0;
-	RotatedPointsModel[n].pos[2] = 0.0;
-      }
-    for(int n = 0; n < Model.nPointsModel; n++)
+  // The rotation matrix is calculated either for the quaternions or for the
+  // euler angles
+  for (int n = 0; n < Model.nPointsModel; n++)
+  {
+    RotatedPointsModel[n].pos[0] = 0.0;
+    RotatedPointsModel[n].pos[1] = 0.0;
+    RotatedPointsModel[n].pos[2] = 0.0;
+  }
+  for (int n = 0; n < Model.nPointsModel; n++)
+  {
+    for (int k = 0; k < 3; k++)
+    {
+      for (int j = 0; j < 3; j++)
       {
-	for(int k = 0; k < 3; k++)
-	  {
-	    for(int j = 0; j < 3; j++)
-	      {
-		RotatedPointsModel[n].pos[k] += rotmat[k][j] * Model.points[n].point.pos[j];
-	      }
-	  }
+        RotatedPointsModel[n].pos[k] +=
+            rotmat[k][j] * Model.points[n].point.pos[j];
       }
+    }
+  }
 
-  
-
-  if(param.printrotmod) {
-    for(int n = 0; n < Model.nPointsModel; n++) cout << "ROTATED " << iMap << " " << n <<" "<< RotatedPointsModel[n].pos[0] << " " << RotatedPointsModel[n].pos[1] << " " <<  RotatedPointsModel[n].pos[2] << "\n";
-
+  if (param.printrotmod)
+  {
+    for (int n = 0; n < Model.nPointsModel; n++)
+      cout << "ROTATED " << iMap << " " << n << " "
+           << RotatedPointsModel[n].pos[0] << " "
+           << RotatedPointsModel[n].pos[1] << " "
+           << RotatedPointsModel[n].pos[2] << "\n";
   }
   int i, j;
 
-  //*************** Creating projection **************************** 
+  //*************** Creating projection ****************************
   //********** Projection with radius ***************
   int irad;
   myfloat_t dist, rad2;
 
-  myfloat_t tempden=0.0;
+  myfloat_t tempden = 0.0;
 
-  for(int n = 0; n < Model.nPointsModel; n++)
+  for (int n = 0; n < Model.nPointsModel; n++)
+  {
+    if (Model.points[n].radius <= param.pixelSize)
     {
-      if(Model.points[n].radius <= param.pixelSize){
-	//   cout << "Radius less than Pixel size: use keyword NO_PROJECT_RADIUS in inputfile\n";
-	i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
-	j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
-
-	if (i < 0 || j < 0 || i >= param.param_device.NumberPixels || j >= param.param_device.NumberPixels)
-	  {
-	    if (DebugOutput >= 0) cout << "WARNING:::: Model Point out of Projection map: " << i << ", " << j << "\n";
-	    //              continue;
-	    if(not param.ignorepointsout)exit(1);
-	  }
-
-	localproj[i * param.param_device.NumberPixels + j] += Model.points[n].density;
-	tempden += Model.points[n].density;
-
-	// exit(1);
-      }else{
-
-	//Getting Centers of Sphere
-	i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f) -param.shiftX;
-	j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize + (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f) -param.shiftY;
-	//Getting the radius
-	irad=int( Model.points[n].radius / param.pixelSize ) + 1;
-	rad2= Model.points[n].radius * Model.points[n].radius;
-
-	if (i < 0 || j < 0 || i >= param.param_device.NumberPixels || j >= param.param_device.NumberPixels)
-	  {
-	    if (DebugOutput >= 0) cout << "WARNING::: Model Point out of Projection map: " << i << ", " << j << "\n";
-	    cout << "Model point " << n << "Rotation: " << iMap <<" "<< RotatedPointsModel[n].pos[0] << " " << RotatedPointsModel[n].pos[1] << " " <<  RotatedPointsModel[n].pos[2] << "\n";
-	    cout << "Original coor " << n <<" " << Model.points[n].point.pos[0] << " " << Model.points[n].point.pos[1] << " " <<Model.points[n].point.pos[2] << "\n";
-	    cout << "WARNING: Angle orient " << n << " " <<  param.angles[iMap].pos[0] << " " <<  param.angles[iMap].pos[1] << " " << param.angles[iMap].pos[2] <<  " out " << i << " " << j << "\n";
-	    cout << "WARNING: MPI rank " << mpi_rank <<"\n";
-	    //              continue;
-	    if(not param.ignorepointsout)exit(1);
-	  }
-
-
-	//Projecting over the radius
-	for(int ii= i - irad; ii < i + irad + 1 ; ii++)
-	  {	
-	    for(int jj = j - irad; jj < j + irad + 1 ; jj++)
-	      {
-		dist= ( (myfloat_t) (ii-i)*(ii-i)+(jj-j)*(jj-j) ) *  param.pixelSize *  param.pixelSize ; //at pixel center
-		if( dist < rad2 )
-		  {
-		    localproj[ii * param.param_device.NumberPixels + jj] += param.pixelSize * param.pixelSize * 2 * sqrt( rad2 - dist ) * Model.points[n].density
-		      * 3 / (4 * M_PI * Model.points[n].radius * rad2 );
-		    tempden += param.pixelSize * param.pixelSize * 2 * sqrt( rad2 - dist ) * Model.points[n].density
-                      * 3 / (4 * M_PI * Model.points[n].radius * rad2 ); 
-		  }
-	      }
-	  }
+      //   cout << "Radius less than Pixel size: use keyword NO_PROJECT_RADIUS
+      //   in inputfile\n";
+      i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize +
+                (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+      j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize +
+                (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f);
+
+      if (i < 0 || j < 0 || i >= param.param_device.NumberPixels ||
+          j >= param.param_device.NumberPixels)
+      {
+        if (DebugOutput >= 0)
+          cout << "WARNING:::: Model Point out of Projection map: " << i << ", "
+               << j << "\n";
+        //              continue;
+        if (not param.ignorepointsout)
+          exit(1);
       }
 
+      localproj[i * param.param_device.NumberPixels + j] +=
+          Model.points[n].density;
+      tempden += Model.points[n].density;
+
+      // exit(1);
     }
+    else
+    {
 
-	// To avoid numerical mismatch in projection errors we normalize by the initial density
+      // Getting Centers of Sphere
+      i = floor(RotatedPointsModel[n].pos[0] / param.pixelSize +
+                (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f) -
+          param.shiftX;
+      j = floor(RotatedPointsModel[n].pos[1] / param.pixelSize +
+                (myfloat_t) param.param_device.NumberPixels / 2.0f + 0.5f) -
+          param.shiftY;
+      // Getting the radius
+      irad = int(Model.points[n].radius / param.pixelSize) + 1;
+      rad2 = Model.points[n].radius * Model.points[n].radius;
+
+      if (i < 0 || j < 0 || i >= param.param_device.NumberPixels ||
+          j >= param.param_device.NumberPixels)
+      {
+        if (DebugOutput >= 0)
+          cout << "WARNING::: Model Point out of Projection map: " << i << ", "
+               << j << "\n";
+        cout << "Model point " << n << "Rotation: " << iMap << " "
+             << RotatedPointsModel[n].pos[0] << " "
+             << RotatedPointsModel[n].pos[1] << " "
+             << RotatedPointsModel[n].pos[2] << "\n";
+        cout << "Original coor " << n << " " << Model.points[n].point.pos[0]
+             << " " << Model.points[n].point.pos[1] << " "
+             << Model.points[n].point.pos[2] << "\n";
+        cout << "WARNING: Angle orient " << n << " "
+             << param.angles[iMap].pos[0] << " " << param.angles[iMap].pos[1]
+             << " " << param.angles[iMap].pos[2] << " out " << i << " " << j
+             << "\n";
+        cout << "WARNING: MPI rank " << mpi_rank << "\n";
+        //              continue;
+        if (not param.ignorepointsout)
+          exit(1);
+      }
 
-    myfloat_t ratioDen;
-  
-   ratioDen = Model.NormDen / tempden ;
+      // Projecting over the radius
+      for (int ii = i - irad; ii < i + irad + 1; ii++)
+      {
+        for (int jj = j - irad; jj < j + irad + 1; jj++)
+        {
+          dist = ((myfloat_t)(ii - i) * (ii - i) + (jj - j) * (jj - j)) *
+                 param.pixelSize * param.pixelSize; // at pixel center
+          if (dist < rad2)
+          {
+            localproj[ii * param.param_device.NumberPixels + jj] +=
+                param.pixelSize * param.pixelSize * 2 * sqrt(rad2 - dist) *
+                Model.points[n].density * 3 /
+                (4 * M_PI * Model.points[n].radius * rad2);
+            tempden += param.pixelSize * param.pixelSize * 2 *
+                       sqrt(rad2 - dist) * Model.points[n].density * 3 /
+                       (4 * M_PI * Model.points[n].radius * rad2);
+          }
+        }
+      }
+    }
+  }
+
+  // To avoid numerical mismatch in projection errors we normalize by the
+  // initial density
 
-   for(int i = 0; i < param.param_device.NumberPixels ; i++){
-	 for(int j = 0; j < param.param_device.NumberPixels ; j++){
-		localproj[ i * param.param_device.NumberPixels + j] *= ratioDen;
-		}
-	} 
+  myfloat_t ratioDen;
+
+  ratioDen = Model.NormDen / tempden;
+
+  for (int i = 0; i < param.param_device.NumberPixels; i++)
+  {
+    for (int j = 0; j < param.param_device.NumberPixels; j++)
+    {
+      localproj[i * param.param_device.NumberPixels + j] *= ratioDen;
+    }
+  }
 
-  // **** Output Just to check****
+// **** Output Just to check****
 #ifdef DEBUG
   //  if(iMap == 0)
   {
     ofstream myexamplemap;
     ofstream myexampleRot;
-    myexamplemap.open ("MAP_i10");
-    myexampleRot.open ("Rot_i10");
+    myexamplemap.open("MAP_i10");
+    myexampleRot.open("Rot_i10");
     myexamplemap << "ANGLES " << alpha << " " << beta << " " << gam << "\n";
-    for(int k = 0; k < param.param_device.NumberPixels; k++)
-      {
-	for(int j = 0; j < param.param_device.NumberPixels; j++) myexamplemap << "\nMAP " << k << " " << j << " " << localproj[k * param.param_device.NumberPixels + j];
-      }
+    for (int k = 0; k < param.param_device.NumberPixels; k++)
+    {
+      for (int j = 0; j < param.param_device.NumberPixels; j++)
+        myexamplemap << "\nMAP " << k << " " << j << " "
+                     << localproj[k * param.param_device.NumberPixels + j];
+    }
     myexamplemap << " \n";
-    for(int n = 0; n < Model.nPointsModel; n++)myexampleRot << "\nCOOR " << RotatedPointsModel[n].pos[0] << " " << RotatedPointsModel[n].pos[1] << " " << RotatedPointsModel[n].pos[2];
+    for (int n = 0; n < Model.nPointsModel; n++)
+      myexampleRot << "\nCOOR " << RotatedPointsModel[n].pos[0] << " "
+                   << RotatedPointsModel[n].pos[1] << " "
+                   << RotatedPointsModel[n].pos[2];
     myexamplemap.close();
     myexampleRot.close();
   }
 #endif
 
-  // ***** Converting projection to Fourier Space for Convolution later with kernel****
+  // ***** Converting projection to Fourier Space for Convolution later with
+  // kernel****
   // ********** Omp Critical is necessary with FFTW*******
 
   myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localproj, mapFFT);
 
   cuda_custom_timeslot_end;
 
-  return(0);
+  return (0);
 }
 
-int bioem::createConvolutedProjectionMap(int iMap, int iConv, mycomplex_t* lproj, myfloat_t* Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC)
+int bioem::createConvolutedProjectionMap(int iMap, int iConv,
+                                         mycomplex_t *lproj,
+                                         mycomplex_t *localmultFFT,
+                                         myfloat_t &sumC, myfloat_t &sumsquareC)
 {
   // **************************************************************************************
-  // ****  BioEM Create Convoluted Projection Map routine, multiplies in Fourier **********
-  // **************** calculated Projection with convoluted precalculated Kernel***********
-  // *************** and Backtransforming it to real Space ********************************
+  // ****  BioEM Create Convoluted Projection Map routine: multiplies, in
+  // ****  Fourier space, the projection (lproj) by the complex conjugate of
+  // ****  the precalculated kernel for convolution iConv (param.refCTF) and
+  // ****  stores the product in localmultFFT; also computes sumC and
+  // ****  sumsquareC from that product. No back-transform to real space is
+  // ****  done in this routine.
   // **************************************************************************************
 
-
   cuda_custom_timeslot("Convolution", iMap, iConv, COLOR_CONVOLUTION);
 
-  mycomplex_t* tmp = param.fft_scratch_complex[omp_get_thread_num()];
-
   // **** Multiplying FFTmap of model with corresponding kernel *******
 
-  const mycomplex_t* refCTF = &param.refCTF[iConv * param.FFTMapSize];
+  const mycomplex_t *refCTF = &param.refCTF[iConv * param.FFTMapSize];
 
-  for(int i = 0; i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D; i++)
-    {
-      localmultFFT[i][0] = ( lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1] ) ;
-      localmultFFT[i][1] = ( lproj[i][1] * refCTF[i][0] - lproj[i][0] * refCTF[i][1] ) ;
-    }
+  for (int i = 0; i < param.param_device.NumberPixels *
+                          param.param_device.NumberFFTPixels1D;
+       i++)
+  {
+    localmultFFT[i][0] =
+        (lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1]);
+    localmultFFT[i][1] =
+        (lproj[i][1] * refCTF[i][0] - lproj[i][0] * refCTF[i][1]);
+  }
 
-  // *** Calculating Cross-correlations of cal-convoluted map with its self ***** (for BioEM formula)
+  // *** sumC: real part of the zero-frequency term of the convoluted map's
+  // *** Fourier transform (for BioEM formula)
   sumC = localmultFFT[0][0];
 
   //**** Calculating the second norm and storing it (for BioEM formula)
   sumsquareC = 0;
-  if (FFTAlgo)
+  //*** With FFT algorithm: column 0 (and the last column when NumberPixels is
+  //*** even) is counted once, columns 1..jloopend-1 twice; then divided by norm2
+  int jloopend = param.param_device.NumberFFTPixels1D;
+  if ((param.param_device.NumberPixels & 1) == 0)
+    jloopend--;
+  for (int i = 0; i < param.param_device.NumberPixels; i++)
+  {
+    for (int j = 1; j < jloopend; j++)
     {
-      //*** With FFT algorithm
-      int jloopend = param.param_device.NumberFFTPixels1D;
-      if ((param.param_device.NumberPixels & 1) == 0) jloopend--;
-      for(int i = 0; i < param.param_device.NumberPixels; i++)
-	{
-	  for (int j = 1;j < jloopend;j++)
-	    {
-	      int k = i * param.param_device.NumberFFTPixels1D + j;
-	      sumsquareC += (localmultFFT[k][0] * localmultFFT[k][0] + localmultFFT[k][1] * localmultFFT[k][1]) * 2;
-	    }
-	  int k = i * param.param_device.NumberFFTPixels1D;
-	  sumsquareC += localmultFFT[k][0] * localmultFFT[k][0] + localmultFFT[k][1] * localmultFFT[k][1];
-	  if ((param.param_device.NumberPixels & 1) == 0)
-	    {
-	      k += param.param_device.NumberFFTPixels1D - 1;
-	      sumsquareC += localmultFFT[k][0] * localmultFFT[k][0] + localmultFFT[k][1] * localmultFFT[k][1];
-	    }
-	}
-
-      myfloat_t norm2 = (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels);
-      sumsquareC = sumsquareC / norm2;
+      int k = i * param.param_device.NumberFFTPixels1D + j;
+      sumsquareC += (localmultFFT[k][0] * localmultFFT[k][0] +
+                     localmultFFT[k][1] * localmultFFT[k][1]) *
+                    2;
     }
-  else
+    int k = i * param.param_device.NumberFFTPixels1D;
+    sumsquareC += localmultFFT[k][0] * localmultFFT[k][0] +
+                  localmultFFT[k][1] * localmultFFT[k][1];
+    if ((param.param_device.NumberPixels & 1) == 0)
     {
-      //***** Slow No FFT ***
+      k += param.param_device.NumberFFTPixels1D - 1;
+      sumsquareC += localmultFFT[k][0] * localmultFFT[k][0] +
+                    localmultFFT[k][1] * localmultFFT[k][1];
+    }
+  }
 
-      //**** Backtransforming the convoluted map it into real space
-      //FFTW_C2R will destroy the input array, so we have to work on a copy here
-      memcpy(tmp, localmultFFT, sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
+  myfloat_t norm2 = (myfloat_t)(param.param_device.NumberPixels *
+                                param.param_device.NumberPixels);
+  sumsquareC = sumsquareC / norm2;
 
-      // **** Bringing convoluted Map to real Space ****
-      myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, tmp, Mapconv);
+  cuda_custom_timeslot_end;
 
-      for(int i = 0; i < param.param_device.NumberPixels * param.param_device.NumberPixels; i++)
-	{
-	  sumsquareC += Mapconv[i] * Mapconv[i];
-	  //	  cout << "CONV " << i << " " << Mapconv[i] << "\n";
-	}
+  return (0);
+}
 
-      myfloat_t norm2 = (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels);
-      myfloat_t norm4 = norm2 * norm2;
-      sumsquareC = sumsquareC / norm4;
-    }
+int bioem::createConvolutedProjectionMap_noFFT(mycomplex_t *lproj,
+                                               myfloat_t *Mapconv,
+                                               mycomplex_t *localmultFFT,
+                                               myfloat_t &sumC,
+                                               myfloat_t &sumsquareC)
+{
+  // Multiplies the Fourier-space projection lproj element-wise by the
+  // conjugate of the reference CTF kernel (param.refCTF), storing the
+  // product in localmultFFT, then back-transforms a copy to real space in
+  // Mapconv.
+  //   sumC       <- real part of element 0 of the product
+  //   sumsquareC <- sum of Mapconv^2 over all pixels, divided by
+  //                 NumberPixels^4
+  // Afterwards it either prints the summed squared difference to
+  // RefMap.maps (param.BestmapCalcCC) or writes the map to the file
+  // BESTMAP, optionally with Gaussian noise (param.withnoise).
+  // Always returns 0. This routine is only for printing the model.
+
+  mycomplex_t *tmp = param.fft_scratch_complex[omp_get_thread_num()];
+
+  // **** Multiplying FFTmap of model with corresponding kernel *******
+  const mycomplex_t *refCTF = param.refCTF;
+
+  for (int i = 0; i < param.param_device.NumberPixels *
+                          param.param_device.NumberFFTPixels1D;
+       i++)
+  {
+    localmultFFT[i][0] =
+        (lproj[i][0] * refCTF[i][0] + lproj[i][1] * refCTF[i][1]);
+    localmultFFT[i][1] =
+        (lproj[i][1] * refCTF[i][0] - lproj[i][0] * refCTF[i][1]);
+  }
 
+  // *** Calculating Cross-correlations of cal-convoluted map with its self
+  // ***** (for BioEM formula)
+  sumC = localmultFFT[0][0];
+
+  //**** Calculating the second norm and storing it (for BioEM formula)
+  sumsquareC = 0;
+
+  //***** Slow No FFT ***
+  //**** Backtransforming the convoluted map it into real space
+  // FFTW_C2R will destroy the input array, so we have to work on a copy here
+  memcpy(tmp, localmultFFT, sizeof(mycomplex_t) *
+                                param.param_device.NumberPixels *
+                                param.param_device.NumberFFTPixels1D);
+
+  // **** Bringing convoluted Map to real Space ****
+  myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, tmp, Mapconv);
+
+  for (int i = 0;
+       i < param.param_device.NumberPixels * param.param_device.NumberPixels;
+       i++)
+  {
+    sumsquareC += Mapconv[i] * Mapconv[i];
+    //	  cout << "CONV " << i << " " << Mapconv[i] << "\n";
+  }
+
+  myfloat_t norm2 = (myfloat_t)(param.param_device.NumberPixels *
+                                param.param_device.NumberPixels);
+  myfloat_t norm4 = norm2 * norm2;
+  sumsquareC = sumsquareC / norm4;
 
   // **************************************************************************************
-  // *********** Routine for printing out the best projetion ******************************
+  // Print the best projection, or its summed squared difference to the
+  // reference map, from the re-transformed and normalised convolved map.
   // **************************************************************************************
 
-  if (mpi_rank == 0 && param.printModel)
-    {
-//      MTRand mtr;
-	bran::mt19937 gen;
-	//Generating random seed so the maps do not have correlated Noise
-  	gen.seed(static_cast<unsigned int>(std::time(0)));
-	//Uniform Noise: bran::uniform_int_distribution<> dist(1, 6);
-
-	//Gaussian noise
- 	 bran::normal_distribution <> distn(0.0,param.stnoise);
-
-      memcpy(tmp, localmultFFT, sizeof(mycomplex_t) * param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D);
-
-      // **** Bringing convoluted Map to real Space ****
-      myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, tmp, Mapconv);
-
-      myfloat_t norm2 = (myfloat_t) (param.param_device.NumberPixels * param.param_device.NumberPixels);
-
-      ofstream myexamplemap;
-      myexamplemap.open ("BESTMAP");
-      for(int k = 0; k < param.param_device.NumberPixels; k++)
-	{
-	  for(int j = 0; j < param.param_device.NumberPixels; j++) {
-	    if(!param.withnoise){
-	      myexamplemap << "\nMAP " << k+param.ddx << " " << j+param.ddy << " " <<  Mapconv[k * param.param_device.NumberPixels + j] / norm2 *param.bestnorm +param.bestoff ; 
-	    } else {
-		myexamplemap << "\nMAP " << k+param.ddx << " " << j+param.ddy << " " <<  Mapconv[k * param.param_device.NumberPixels + j] / norm2 *param.bestnorm +param.bestoff+distn(gen);
-//		cout << distn(gen) << "CHECK\n";
-	    }
-	  }
-	  myexamplemap << " \n";
-	}
-      myexamplemap.close();
-
-      cout << "\n\nBest map printed in file: BESTMAP with gnuplot format in columns 2, 3 and 4. \n\n\n";
-      exit(1);
+  // Calling random number routine from MersenneTwister.h
+  MTRand mtr;
 
-    }
+  // Generating random seed so the maps do not have correlated Noise
+  mtr.seed(static_cast<unsigned int>(std::time(0)));
 
-  cuda_custom_timeslot_end;
+  memcpy(tmp, localmultFFT, sizeof(mycomplex_t) *
+                                param.param_device.NumberPixels *
+                                param.param_device.NumberFFTPixels1D);
+
+  // **** Bringing convoluted Map to real Space ****
+  myfftw_execute_dft_c2r(param.fft_plan_c2r_backward, tmp, Mapconv);
+
+  // Calculating the cross-correlation to the ref maps
+  // ccbm sums squared differences to RefMap.maps[k * NumberPixels + j]
+  if (param.BestmapCalcCC)
+  {
+    myfloat_t ccbm = 0.;
+    int kk, jj;
+
+    for (int k = 0; k < param.param_device.NumberPixels; k++)
+    {
+      for (int j = 0; j < param.param_device.NumberPixels; j++)
+      {
+        // Reference pixel (k, j) is compared with model pixel
+        // (k - ddx, j - ddy). The shifted indices are wrapped periodically
+        // into [0, NumberPixels) so they never index outside Mapconv.
+        // C++ '%' keeps the sign of the dividend, so negative remainders
+        // are moved back into range by adding NumberPixels once.
+        kk = (k - param.ddx) % param.param_device.NumberPixels;
+        jj = (j - param.ddy) % param.param_device.NumberPixels;
+        if (kk < 0)
+          kk += param.param_device.NumberPixels;
+        if (jj < 0)
+          jj += param.param_device.NumberPixels;
+
+        ccbm += (Mapconv[kk * param.param_device.NumberPixels + jj] / norm2 *
+                     param.bestnorm -
+                 RefMap.maps[k * param.param_device.NumberPixels + j]) *
+                (Mapconv[kk * param.param_device.NumberPixels + jj] / norm2 *
+                     param.bestnorm -
+                 RefMap.maps[k * param.param_device.NumberPixels + j]);
+      }
+    }
+    cout << "CROSS CORELATION " << ccbm << "\n";
+  }
+  else
+  {
+    ofstream myexamplemap;
+    myexamplemap.open("BESTMAP");
+    for (int k = 0; k < param.param_device.NumberPixels; k++)
+    {
+      for (int j = 0; j < param.param_device.NumberPixels; j++)
+      {
+        if (!param.withnoise)
+        {
+          myexamplemap << "\nMAP " << k + param.ddx << " " << j + param.ddy
+                       << " "
+                       << Mapconv[k * param.param_device.NumberPixels + j] /
+                                  norm2 * param.bestnorm +
+                              param.bestoff;
+          if (k >= param.ddx && k - param.ddx < param.param_device.NumberPixels &&
+              j >= param.ddy && j - param.ddy < param.param_device.NumberPixels)
+          {
+            myexamplemap
+                << "\nMAPddx " << k << " " << j << " "
+                << Mapconv[(k - param.ddx) * param.param_device.NumberPixels +
+                           j - param.ddy] /
+                           norm2 * param.bestnorm +
+                       param.bestoff;
+          }
+        }
+        else
+        {
+          myexamplemap << "\nMAP " << k + param.ddx << " " << j + param.ddy
+                       << " "
+                       << Mapconv[k * param.param_device.NumberPixels + j] /
+                                  norm2 * param.bestnorm +
+                              param.bestoff +
+                              mtr.randNorm(0.0,
+                                           param.stnoise); //\\+ distn(gen);
+          //		cout << distn(gen) << "CHECK\n";
+        }
+      }
+      myexamplemap << " \n";
+    }
+    myexamplemap.close();
 
-  return(0);
+    cout << "\n\nBest map printed in file: BESTMAP with gnuplot format in "
+            "columns 2, 3 and 4. \n\n\n";
+  }
+  return (0);
 }
 
-int bioem::calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare)
+int bioem::calcross_cor(myfloat_t *localmap, myfloat_t &sum,
+                        myfloat_t &sumsquare)
 {
-  // *********************** Routine to calculate Cross correlations***********************
+  // Accumulates into sum the total of all NumberPixels x NumberPixels entries
+  // of localmap and into sumsquare the total of their squares; returns 0.
 
   sum = 0.0;
   sumsquare = 0.0;
   for (int i = 0; i < param.param_device.NumberPixels; i++)
+  {
+    for (int j = 0; j < param.param_device.NumberPixels; j++)
     {
-      for (int j = 0; j < param.param_device.NumberPixels; j++)
-	{
-	  // Calculate Sum of pixels
-	  sum += localmap[i * param.param_device.NumberPixels + j];
-	  // Calculate Sum of pixels squared
-	  sumsquare += localmap[i * param.param_device.NumberPixels + j] * localmap[i * param.param_device.NumberPixels + j];
-	}
+      // Running sum of the pixel values (entry (i, j) lives at i * N + j)
+      sum += localmap[i * param.param_device.NumberPixels + j];
+      // Running sum of the squared pixel values
+      sumsquare += localmap[i * param.param_device.NumberPixels + j] *
+                   localmap[i * param.param_device.NumberPixels + j];
     }
-  return(0);
+  }
+  return (0);
 }
 
-int bioem::deviceInit()
-{
-  return(0);
-}
+int bioem::deviceInit() { return (0); } // does no work in this implementation; always returns 0
 
-int bioem::deviceStartRun()
-{
-  return(0);
-}
+int bioem::deviceStartRun() { return (0); } // does no work in this implementation; always returns 0
 
-int bioem::deviceFinishRun()
-{
-  return(0);
-}
+int bioem::deviceFinishRun() { return (0); } // does no work in this implementation; always returns 0
 
-void* bioem::malloc_device_host(size_t size)
-{
-  return(mallocchk(size));
-}
+void *bioem::malloc_device_host(size_t size) { return (mallocchk(size)); } // forwards to mallocchk(size) and returns its result
 
-void bioem::free_device_host(void* ptr)
-{
-  free(ptr);
-}
+void bioem::free_device_host(void *ptr) { free(ptr); } // releases ptr with free()
 
 void bioem::rebalanceWrapper(int workload)
 {
diff --git a/bioem_algorithm.h b/bioem_algorithm.h
index c1936efe85e6e38998f7c1f5b4caf566fec361df..494ba6599d51fd63d27b895070beca3ae5ffe1fb 100644
--- a/bioem_algorithm.h
+++ b/bioem_algorithm.h
@@ -1,505 +1,200 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 
 #ifndef BIOEM_ALGORITHM_H
 #define BIOEM_ALGORITHM_H
-//#include <boost/iterator/iterator_concepts.hpp>
 
-#ifndef BIOEM_GPUCODE
-//#define SSECODE //Explicit SSE code, not correct yet since loop counter is assumed multiple of 4, anyway not faster than autovectorized code, only implemented for float, not for double.
-#endif
-
-#ifdef SSECODE
-#include <emmintrin.h>
-#include <smmintrin.h>
-#endif
-
-template <int GPUAlgo>
-__device__ static inline void update_prob(const myfloat_t logpro, const int iRefMap, const int iOrient, const int iConv, const int cent_x, const int cent_y, bioem_Probability& pProb, bool doAngle, myfloat_t* buf3 = NULL, int* bufint = NULL)
-{
-
-  // *********** Not using FFT ALGORITHM ******************
-  //*********** Routine to perform the numerical BioEM intergal ***********
-
-  // *******  Summing total Probabilities *************
-
-  bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
-
-  // ******* Need a constant because of numerical divergence*****
-  if(pProbMap.Constoadd < logpro)
-    {
-      pProbMap.Total = pProbMap.Total * exp(-logpro + pProbMap.Constoadd);
-      pProbMap.Constoadd = logpro;
-		
-      // ********** Getting parameters that maximize the probability ***********
-      if (GPUAlgo == 2)
-	{
-	  bufint[0] = 1;
-	  buf3[1] = logpro;
-	}
-      else
-	{
-	  pProbMap.max.max_prob_cent_x = - cent_x;
-	  pProbMap.max.max_prob_cent_y = - cent_y;
-	}
-      pProbMap.max.max_prob_orient = iOrient;
-      pProbMap.max.max_prob_conv = iConv;
-     // pProbMap.max.max_prob_norm = - ( -sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value ) / ( sumC * sumC - sumsquareC *  param.Ntotpi);
-     //  pProbMap.max.max_prob_mu = - ( -sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap] ) / ( sumC * sumC - sumsquareC *  param.Ntotpi);
-    }
-  if (GPUAlgo != 2) pProbMap.Total += exp(logpro - pProbMap.Constoadd);
-
-  if (doAngle)
-    {
-      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
-
-      //Summing probabilities for each orientation
-      if(pProbAngle.ConstAngle < logpro)
-	{
-	  pProbAngle.forAngles = pProbAngle.forAngles * exp(-logpro + pProbAngle.ConstAngle);
-	  pProbAngle.ConstAngle = logpro;
-	}
-
-      if (GPUAlgo != 2) pProbAngle.forAngles += exp(logpro - pProbAngle.ConstAngle);
-    }
-}
-
-__device__ static inline myfloat_t calc_logpro(const bioem_param_device& param, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sum, const myfloat_t sumsquare, const myfloat_t crossproMapConv, const myfloat_t sumref, const myfloat_t sumsquareref)
+__device__ static inline myprob_t
+calc_logpro(const bioem_param_device &param, const myfloat_t amp,
+            const myfloat_t pha, const myfloat_t env, const myfloat_t sum,
+            const myfloat_t sumsquare, const myfloat_t crossproMapConv,
+            const myfloat_t sumref, const myfloat_t sumsquareref)
 {
 
   //*** MAIN ROUTINE TO CALCULATE THE LOGPRO FOR ALL KERNELS*************//
   // **** calculate the log posterior of Eq. of Pmw in SI of JSB paper ***//
 
   // Related to Reference calculated Projection
-  const myfloat_t ForLogProb = (sumsquare * param.Ntotpi - sum * sum);
+  const myprob_t ForLogProb = sumsquare * param.Ntotpi - sum * sum;
 
   // Products of different cross-correlations (first element in formula)
-  const myfloat_t firstele = param.Ntotpi * (sumsquareref * sumsquare - crossproMapConv * crossproMapConv) +
-    2 * sumref * sum * crossproMapConv - sumsquareref * sum * sum - sumref * sumref * sumsquare;
+  const myprob_t firstele =
+      param.Ntotpi *
+          (sumsquareref * sumsquare - crossproMapConv * crossproMapConv) +
+      2 * sumref * sum * crossproMapConv - sumsquareref * sum * sum -
+      sumref * sumref * sumsquare;
 
   /// ******* Calculating log of Prob*********
-  // As in fortran code: logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
-
-  myfloat_t logpro = (3 - param.Ntotpi) * 0.5 * log(firstele) + (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
-
-  //*************Adding Gaussian Prior to envelope & Defocus parameter******************
-
-  if(not param.tousepsf){
-    logpro = logpro - env * env / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf -
-        (pha - param.Priordefcent ) * (pha - param.Priordefcent ) / 2. / param.sigmaPriordefo / param.sigmaPriordefo ;
-  } else {
-    myfloat_t envF,phaF;
-    envF = 4.* M_PI * M_PI * env / ( env * env + pha * pha) ;
-    phaF = 4.* M_PI * M_PI * pha / ( env * env + pha * pha);
-    logpro = logpro - envF * envF / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf - (phaF - param.Priordefcent ) * (phaF - param.Priordefcent ) / 2. / param.sigmaPriordefo / param.sigmaPriordefo ;
+  // As in fortran code:
+  // logpro=(3-Ntotpi)*0.5*log(firstele/pConvMap[iOrient].ForLogProbfromConv[iConv])+(Ntotpi*0.5-2)*log(Ntotpi-2)-0.5*log(pConvMap[iOrient].ForLogProbfromConv[iConv])+0.5*log(PI)+(1-Ntotpi*0.5)*(log(2*PI)+1);
+
+  myprob_t logpro =
+      (3 - param.Ntotpi) * 0.5 * log(firstele) +
+      (param.Ntotpi * 0.5 - 2) * log((param.Ntotpi - 2) * ForLogProb);
+
+  // Gaussian priors on envelope, defocus and amplitude: the three quadratic
+  // penalties are summed first, so "-=" subtracts every one of them.
+
+  if (not param.tousepsf)
+  {
+    logpro -= env * env / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf +
+              (pha - param.Priordefcent) * (pha - param.Priordefcent) / 2. /
+                  param.sigmaPriordefo / param.sigmaPriordefo +
+              (amp - param.Priorampcent) * (amp - param.Priorampcent) / 2. /
+                  param.sigmaPrioramp / param.sigmaPrioramp;
+  }
+  else
+  {
+    myprob_t envF, phaF;
+    envF = 4. * M_PI * M_PI * env / (env * env + pha * pha);
+    phaF = 4. * M_PI * M_PI * pha / (env * env + pha * pha);
+    logpro -= envF * envF / 2. / param.sigmaPriorbctf / param.sigmaPriorbctf +
+              (phaF - param.Priordefcent) * (phaF - param.Priordefcent) / 2. /
+                  param.sigmaPriordefo / param.sigmaPriordefo +
+              (amp - param.Priorampcent) * (amp - param.Priorampcent) / 2. /
+                  param.sigmaPrioramp / param.sigmaPrioramp;
   }
 
-  return(logpro);
+  return (logpro);
 }
 
-__device__ static inline void calProb(int iRefMap, int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, myfloat_t sumC, myfloat_t sumsquareC, myfloat_t value, int disx, int disy, bioem_Probability& pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
+__device__ static inline void
+calProb(int iRefMap, int iOrient, int iConv, const myfloat_t amp,
+        const myfloat_t pha, const myfloat_t env, const myfloat_t sumC,
+        const myfloat_t sumsquareC, myfloat_t value, int disx, int disy,
+        bioem_Probability &pProb, const bioem_param_device &param,
+        const bioem_RefMap &RefMap)
 {
   // IMPORTANT ROUTINE Summation of LogProb using FFTALGO
   // ********************************************************
   // *********** Calculates the BioEM probability ***********
   // ********************************************************
 
-  myfloat_t logpro = calc_logpro(param, amp, pha, env, sumC, sumsquareC, value, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
+  myfloat_t logpro = // NOTE(review): calc_logpro returns myprob_t; confirm the conversion to myfloat_t is intended
+      calc_logpro(param, amp, pha, env, sumC, sumsquareC, value,
+                  RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
 
-  //GCC is too stupid to inline properly, so the code is copied here
-  //update_prob<-1>(logpro, iRefMap, iOrient, iConv, disx, disy, pProb, param.writeAngles);
-
-  bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
-
-  //	 printf("Separate PtotBef: %f Const: %f logProb %f %d %d %d \n",pProbMap.Total,pProbMap.Constoadd,logpro,iRefMap,iOrient,iConv);
-  if(pProbMap.Constoadd < logpro)
-    {
-      pProbMap.Total = pProbMap.Total * exp(-logpro + pProbMap.Constoadd);
-      pProbMap.Constoadd = logpro;
+#ifdef DEBUG_PROB
+  printf("\t\t\tProb: iRefMap %d, iOrient %d, iConv %d, "
+         "disx %d, disy %d, address -, value %f, logpro %f\n",
+         iRefMap, iOrient, iConv, disx, disy, value, logpro);
+#endif
 
-      // ********** Getting parameters that maximize the probability ***********
-      pProbMap.max.max_prob_cent_x = - disx;
-      pProbMap.max.max_prob_cent_y = - disy;
-      pProbMap.max.max_prob_orient = iOrient;
-      pProbMap.max.max_prob_conv = iConv;
-      pProbMap.max.max_prob_norm = - ( -sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value ) / ( sumC * sumC - sumsquareC *  param.Ntotpi);
-      pProbMap.max.max_prob_mu = - ( -sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap] ) / ( sumC * sumC - sumsquareC *  param.Ntotpi);
-    }
+  bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap); // accumulator entry for reference map iRefMap
+
+  if (pProbMap.Constoadd < logpro) // logpro exceeds the offset stored so far
+  {
+    pProbMap.Total *= exp(-logpro + pProbMap.Constoadd); // rescale the running sum to the new offset
+    pProbMap.Constoadd = logpro;
+
+    // Record the displacement, orientation, kernel, norm and mu of this maximum
+    pProbMap.max.max_prob_cent_x = -disx;
+    pProbMap.max.max_prob_cent_y = -disy;
+    pProbMap.max.max_prob_orient = iOrient;
+    pProbMap.max.max_prob_conv = iConv;
+    pProbMap.max.max_prob_norm =
+        -(-sumC * RefMap.sum_RefMap[iRefMap] + param.Ntotpi * value) /
+        (sumC * sumC - sumsquareC * param.Ntotpi);
+    pProbMap.max.max_prob_mu =
+        -(-sumC * value + sumsquareC * RefMap.sum_RefMap[iRefMap]) /
+        (sumC * sumC - sumsquareC * param.Ntotpi);
+
+#ifdef DEBUG_PROB
+    printf("\tProbabilities change: iRefMap %d, iOrient %d, iConv %d, Total "
+           "%f, Const %f, bestlogpro %f, sumExp -, bestId -\n",
+           iRefMap, iOrient, iConv, pProbMap.Total, pProbMap.Constoadd, logpro);
+    printf("\tParameters: iConv %d, myX -, myY -, disx %d, disy %d, probX "
+           "%d, probY %d\n",
+           iConv, disx, disy, pProbMap.max.max_prob_cent_x,
+           pProbMap.max.max_prob_cent_y);
+#endif
+  }
   pProbMap.Total += exp(logpro - pProbMap.Constoadd);
-
+#ifdef DEBUG_PROB
+  printf("\t\tProbabilities after Sum: iRefMap %d, iOrient %d, iConv %d, "
+         "Total %f, Const %f, bestlogpro %f, sumExp -, bestId -\n",
+         iRefMap, iOrient, iConv, pProbMap.Total, pProbMap.Constoadd, logpro);
+#endif
 
   if (param.writeAngles)
-    {
-      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
-      //	if(iOrient==1)printf("Separate Ptot: %f Const: %f logProb %f param: %d %d %d \n",logpro,pProbAngle.ConstAngle,pProbAngle.forAngles,disx,disx,iOrient);
-
-      if(pProbAngle.ConstAngle < logpro)
-	{
-	  pProbAngle.forAngles = pProbAngle.forAngles * exp(-logpro + pProbAngle.ConstAngle);
-	  pProbAngle.ConstAngle = logpro;
-	}
-      pProbAngle.forAngles += exp(logpro - pProbAngle.ConstAngle);
-      //    if(iOrient==5)printf("After separate Ptot: %f Const: %f logProb %f \n",logpro,pProbAngle.ConstAngle,pProbAngle.forAngles);
-    }      
+  {
+    bioem_Probability_angle &pProbAngle = pProb.getProbAngle(iRefMap, iOrient); // per-orientation accumulator
 
-}
-
-__device__ static inline void doRefMapFFT(const int iRefMap, const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability& pProb, const bioem_param_device& param, const bioem_RefMap& RefMap)
-{
-  //******************** Using FFT algorithm **************************
-  //******************* Get cross-crollation of Ical to Iobs *******************
-  //*********** Routine to get the Cross-Corellation from lCC for the interested center displacement *************
-
-  for (int cent_x = 0; cent_x <= param.maxDisplaceCenter; cent_x = cent_x + param.GridSpaceCenter)
+    if (pProbAngle.ConstAngle < logpro) // same offset/rescale update as for the map total
     {
-      for (int cent_y = 0; cent_y <= param.maxDisplaceCenter; cent_y = cent_y + param.GridSpaceCenter)
-	{
-	  calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y, pProb, param, RefMap);
-	}
-      for (int cent_y = param.NumberPixels - param.maxDisplaceCenter; cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
-	{
-	  calProb(iRefMap, iOrient, iConv,amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x, cent_y - param.NumberPixels, pProb, param, RefMap);
-	}
+      pProbAngle.forAngles *= exp(-logpro + pProbAngle.ConstAngle);
+      pProbAngle.ConstAngle = logpro;
     }
 
-  for (int cent_x = param.NumberPixels - param.maxDisplaceCenter; cent_x < param.NumberPixels; cent_x = cent_x + param.GridSpaceCenter)
-    {
-      for (int cent_y = 0; cent_y < param.maxDisplaceCenter; cent_y = cent_y + param.GridSpaceCenter)
-	{
-	  calProb(iRefMap, iOrient, iConv,amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x - param.NumberPixels, cent_y, pProb, param, RefMap);
-	}
-      for (int cent_y = param.NumberPixels - param.maxDisplaceCenter ; cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
-	{
-	  calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels), cent_x - param.NumberPixels, cent_y - param.NumberPixels, pProb, param, RefMap);
-	}
-    }
-
-  //************ The following if is not in the manual***********
-  if (param.writeCC)
-    {
-      // If the Cross-correlation is to be written out and stored using Bayesian analysis
-      int  cc=0;
-      for (int cent_x = 0; cent_x < param.NumberPixels ; cent_x = cent_x + param.CCdisplace)
-	{
-	  for (int cent_y = 0; cent_y < param.NumberPixels ; cent_y = cent_y + param.CCdisplace)
-	    {
-
-
-	      bioem_Probability_cc& pProbCC = pProb.getProbCC(iRefMap, cc);
-
-	      myfloat_t ttmp,ttmp2;
-	      ttmp2 = (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] / (myfloat_t) (param.NumberPixels * param.NumberPixels);
-	
-	      if(not param.flipped){
-		//Here we are inverting the sign of the cross-correlation for the images that are not flipped
-		ttmp=-ttmp2;            }        
-	      else{           ttmp=ttmp2;             }
-
-	      if(!param.CCwithBayes){
-		// Storing Only the Maximum both for flipped and not flipped
-
-		if(pProbCC.forCC < ttmp) pProbCC.forCC = ttmp;
-
-	      }else {
-		// Storing the Cross-correlation with Bayesian formalism
-		if(pProbCC.ConstCC < ttmp)
-		  {
-		    pProbCC.forCC = pProbCC.forCC * exp(-ttmp + pProbCC.ConstCC);
-		    pProbCC.ConstCC = ttmp;
-		  }
-		pProbCC.forCC += exp(ttmp - pProbCC.ConstCC);
-       
-	      }
-	      //           printf("Separate %d %d Ptot: %f Const: %f logProb %f \n",cent_x,cent_y,pProbCC.forCC,pProbCC.ConstCC,ttmp);       
-	      cc++;
-
-	    }
-	}
-    }
-
-
+    pProbAngle.forAngles += exp(logpro - pProbAngle.ConstAngle);
+  }
 }
 
-template <int GPUAlgo, class RefT>
-  __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC, 
-						const myfloat_t sumsquareC, const myfloat_t* Mapconv, bioem_Probability& pProb, const bioem_param_device& param, const RefT& RefMap,
-					      const int cent_x, const int cent_y, const int myShift = 0, const int nShifts2 = 0, const int myRef = 0, const bool threadActive = true)
+__device__ static inline void
+doRefMapFFT(const int iRefMap, const int iOrient, const int iConv,
+            const myfloat_t amp, const myfloat_t pha, const myfloat_t env,
+            const myfloat_t sumC, const myfloat_t sumsquareC,
+            const myfloat_t *lCC, bioem_Probability &pProb,
+            const bioem_param_device &param, const bioem_RefMap &RefMap)
 {
-
-  //********************* Non FOURIER ALGORITHMS (refer to David) ***********
-  // **********************  Calculating BioEM Probability ********************************
-  // ************************* Loop of center displacement here ***************************
-
-  // Taking into account the center displacement
-
-  // Inizialzing crosscorrelations of calculated projected convolutions
-#ifdef SSECODE
-  myfloat_t sum, sumsquare, crossproMapConv;
-  __m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2;
-#else
-  myfloat_t sum = 0.0;
-  myfloat_t sumsquare = 0.0;
-  myfloat_t crossproMapConv = 0.0;
-#endif
-  // Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map
-  myfloat_t logpro;
-  if (GPUAlgo != 2 || threadActive)
+  //******************** Using FFT algorithm **************************
+  //******************* Get cross-crollation of Ical to Iobs *******************
+  //*********** Routine to get the Cross-Corellation from lCC for the interested
+  // center displacement *************
+
+  for (int cent_x = 0; cent_x <= param.maxDisplaceCenter;
+       cent_x = cent_x + param.GridSpaceCenter)
+  {
+    for (int cent_y = 0; cent_y <= param.maxDisplaceCenter;
+         cent_y = cent_y + param.GridSpaceCenter)
     {
-      int iStart, jStart, iEnd, jEnd;
-      if (cent_x < 0)
-	{
-	  iStart = -cent_x;
-	  iEnd = param.NumberPixels;
-	}
-      else
-	{
-	  iStart = 0;
-	  iEnd = param.NumberPixels - cent_x;
-	}
-      if (cent_y < 0)
-	{
-	  jStart = -cent_y;
-	  jEnd = param.NumberPixels;
-	}
-      else
-	{
-	  jStart = 0;
-	  jEnd = param.NumberPixels - cent_y;
-	}
-
-      for (int i = iStart; i < iEnd; i += 1)
-	{
-#ifdef SSECODE
-	  const float* ptr1 = &Mapconv.points[i + cent_x][jStart + cent_y];
-	  const float* ptr2 = RefMap.getp(iRefMap, i, jStart);
-	  int j;
-	  const int count = jEnd - jStart;
-	  for (j = 0; j <= count - 4; j += 4)
-	    {
-	      d1 = _mm_loadu_ps(ptr1);
-	      d2 = _mm_loadu_ps(ptr2);
-	      sum_v = _mm_add_ps(sum_v, d1);
-	      sumsquare_v = _mm_add_ps(sumsquare_v, _mm_mul_ps(d1, d1));
-	      cross_v = _mm_add_ps(cross_v, _mm_mul_ps(d1, d2));
-	      ptr1 += 4;
-	      ptr2 += 4;
-	    }
-#else
-	  for (int j = jStart; j < jEnd; j += 1)
-	    {
-	      const myfloat_t pointMap = Mapconv[(i + cent_x) * param.NumberPixels + j + cent_y];
-	      const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
-	      crossproMapConv += pointMap * pointRefMap;
-	      // Crosscorrelation of calculated displaced map
-	      sum += pointMap;
-	      // Calculate Sum of pixels squared
-	      sumsquare += pointMap * pointMap;
-	    }
-#endif
-	}
-#ifdef SSECODE
-      sum_v = _mm_hadd_ps(sum_v, sum_v);
-      sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
-      cross_v = _mm_hadd_ps(cross_v, cross_v);
-      sum_v = _mm_hadd_ps(sum_v, sum_v);
-      sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
-      cross_v = _mm_hadd_ps(cross_v, cross_v);
-      sum = _mm_cvtss_f32(sum_v);
-      sumsquare = _mm_cvtss_f32(sumsquare_v);
-      crossproMapConv = _mm_cvtss_f32(cross_v);
-#endif
-
-      // Calculating elements in BioEM Probability formula
-      logpro = calc_logpro(param, amp, pha, env, sum, sumsquare, crossproMapConv, RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
+      calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
+              (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
+                  (myfloat_t)(param.NumberPixels * param.NumberPixels),
+              cent_x, cent_y, pProb, param, RefMap);
     }
-  else
+    for (int cent_y = param.NumberPixels - param.maxDisplaceCenter;
+         cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
     {
-      logpro = 0;
+      calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
+              (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
+                  (myfloat_t)(param.NumberPixels * param.NumberPixels),
+              cent_x, cent_y - param.NumberPixels, pProb, param, RefMap);
     }
+  }
 
-#ifdef BIOEM_GPUCODE
-  if (GPUAlgo == 2)
-    {
-      extern __shared__ myfloat_t buf[];
-      myfloat_t* buf2 = &buf[myBlockDimX];
-      myfloat_t* buf3 = &buf2[myBlockDimX + 4 * myRef];
-      int* bufint = (int*) buf3;
-
-      buf[myThreadIdxX] = logpro;
-      if (myShift == 0)
-	{
-	  bufint[0] = 0;
-	}
-      __syncthreads();
-
-      if (nShifts2 == CUDA_MAX_SHIFT_REDUCE) // 1024
-	{
-	  if (myShift < 512) if (buf[myThreadIdxX + 512] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 512];
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 512)
-	{
-	  if (myShift < 256) if (buf[myThreadIdxX + 256] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 256];
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 256)
-	{
-	  if (myShift < 128) if (buf[myThreadIdxX + 128] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 128];
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 128)
-	{
-	  if (myShift < 64) if (buf[myThreadIdxX + 64] > buf[myThreadIdxX]) buf[myThreadIdxX] = buf[myThreadIdxX + 64];
-	  __syncthreads();
-	}
-
-      if (myShift < 32) //Warp Size is 32, threads are synched automatically
-	{
-	  volatile myfloat_t* vbuf = buf; //Mem must be volatile such that memory access is not reordered
-	  if (nShifts2 >= 64 && vbuf[myThreadIdxX + 32] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 32];
-	  if (nShifts2 >= 32 && vbuf[myThreadIdxX + 16] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 16];
-	  if (nShifts2 >= 16 && vbuf[myThreadIdxX + 8] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 8];
-	  if (nShifts2 >= 8 && vbuf[myThreadIdxX + 4] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 4];
-	  if (nShifts2 >= 4 && vbuf[myThreadIdxX + 2] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 2];
-	  if (nShifts2 >= 2 && vbuf[myThreadIdxX + 1] > vbuf[myThreadIdxX]) vbuf[myThreadIdxX] = vbuf[myThreadIdxX + 1];
-	  if (myShift == 0 && iRefMap < RefMap.ntotRefMap)
-	    {
-	      const myfloat_t logpro_max = vbuf[myThreadIdxX];
-	      update_prob<GPUAlgo>(logpro_max, iRefMap, iOrient, iConv, -1, -1, pProb, param.writeAngles, buf3, bufint);
-	    }
-	}
-
-      __syncthreads();
-
-      bioem_Probability_map& pProbMap = pProb.getProbMap(iRefMap);
-      bioem_Probability_angle& pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
-
-      if (bufint[0] == 1 && buf3[1] == logpro && iRefMap < RefMap.ntotRefMap && atomicAdd(&bufint[0], 1) == 1)
-	{
-	  pProbMap.max.max_prob_cent_x = - cent_x;
-	  pProbMap.max.max_prob_cent_y = - cent_y;
-	}
-
-      __syncthreads();
-
-      if (iRefMap < RefMap.ntotRefMap)
-	{
-	  buf[myThreadIdxX] = exp(logpro - pProbMap.Constoadd);
-	  buf2[myThreadIdxX] = exp(logpro - pProbAngle.ConstAngle);
-	}
-      __syncthreads();
-
-      if (nShifts2 == CUDA_MAX_SHIFT_REDUCE) // 1024
-	{
-	  if (myShift < 512)
-	    {
-	      buf[myThreadIdxX] += buf[myThreadIdxX + 512];
-	      buf2[myThreadIdxX] += buf2[myThreadIdxX + 512];
-	    }
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 512)
-	{
-	  if (myShift < 256)
-	    {
-	      buf[myThreadIdxX] += buf[myThreadIdxX + 256];
-	      buf2[myThreadIdxX] += buf2[myThreadIdxX + 256];
-	    }
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 256)
-	{
-	  if (myShift < 128)
-	    {
-	      buf[myThreadIdxX] += buf[myThreadIdxX + 128];
-	      buf2[myThreadIdxX] += buf2[myThreadIdxX + 128];
-	    }
-	  __syncthreads();
-	}
-
-      if (nShifts2 >= 128)
-	{
-	  if (myShift < 64)
-	    {
-	      buf[myThreadIdxX] += buf[myThreadIdxX + 64];
-	      buf2[myThreadIdxX] += buf2[myThreadIdxX + 64];
-	    }
-	  __syncthreads();
-	}
-
-      if (myShift < 32) //Warp Size is 32, threads are synched automatically
-	{
-	  volatile myfloat_t* vbuf = buf; //Mem must be volatile such that memory access is not reordered
-	  volatile myfloat_t* vbuf2 = buf2;
-	  if (nShifts2 >= 64)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 32];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 32];
-	    }
-	  if (nShifts2 >= 32)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 16];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 16];
-	    }
-	  if (nShifts2 >= 16)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 8];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 8];
-	    }
-	  if (nShifts2 >= 8)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 4];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 4];
-	    }
-	  if (nShifts2 >= 4)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 2];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 2];
-	    }
-	  if (nShifts2 >= 2)
-	    {
-	      vbuf[myThreadIdxX] += vbuf[myThreadIdxX + 1];
-	      vbuf2[myThreadIdxX] += vbuf2[myThreadIdxX + 1];
-	    }
-	  if (myShift == 0 && iRefMap < RefMap.ntotRefMap)
-	    {
-	      pProbMap.Total += vbuf[myThreadIdxX];
-	      pProbAngle.forAngles += vbuf2[myThreadIdxX];
-	    }
-	}
-    }
-  else
-#endif
+  for (int cent_x = param.NumberPixels - param.maxDisplaceCenter;
+       cent_x < param.NumberPixels; cent_x = cent_x + param.GridSpaceCenter)
+  {
+    for (int cent_y = 0; cent_y <= param.maxDisplaceCenter;
+         cent_y = cent_y + param.GridSpaceCenter)
     {
-      update_prob < -1 > (logpro, iRefMap, iOrient, iConv, cent_x, cent_y, pProb, param.writeAngles);
+      calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
+              (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
+                  (myfloat_t)(param.NumberPixels * param.NumberPixels),
+              cent_x - param.NumberPixels, cent_y, pProb, param, RefMap);
     }
-}
-
-template <int GPUAlgo, class RefT>
-  __device__ static inline void compareRefMapShifted(const int iRefMap, const int iOrient, const int iConv,  const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC, const myfloat_t sumsquareC, const myfloat_t* Mapconv, bioem_Probability& pProb, const bioem_param_device& param, const RefT& RefMap)
-{
-  for (int cent_x = -param.maxDisplaceCenter; cent_x <= param.maxDisplaceCenter; cent_x = cent_x + param.GridSpaceCenter)
+    for (int cent_y = param.NumberPixels - param.maxDisplaceCenter;
+         cent_y < param.NumberPixels; cent_y = cent_y + param.GridSpaceCenter)
     {
-      for (int cent_y = -param.maxDisplaceCenter; cent_y <= param.maxDisplaceCenter; cent_y = cent_y + param.GridSpaceCenter)
-	{
-	  compareRefMap<GPUAlgo>(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, Mapconv, pProb, param, RefMap, cent_x, cent_y);
-	}
+      calProb(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC,
+              (myfloat_t) lCC[cent_x * param.NumberPixels + cent_y] /
+                  (myfloat_t)(param.NumberPixels * param.NumberPixels),
+              cent_x - param.NumberPixels, cent_y - param.NumberPixels, pProb,
+              param, RefMap);
     }
+  }
 }
 
 #endif
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index 71a5c97ee2f30633ad69d64f139fc5cac4064798..75b6ee72e4c6bc6285280e956b908ee2ac39869b 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -1,9 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -18,715 +22,1076 @@
 using namespace std;
 
 #include "bioem_cuda_internal.h"
-#include "bioem_algorithm.h"
 //#include "helper_cuda.h"
 
-#define checkCudaErrors(error) \
-{ \
-	if ((error) != cudaSuccess) \
-	{ \
-		printf("CUDA Error %d / %s (%s: %d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__); \
-		exit(1); \
-	} \
-}
+#include "bioem_algorithm.h"
+
+#define checkCudaErrors(error)                                                 \
+  { /* abort with a diagnostic if a CUDA runtime call reports an error */      \
+    if (const cudaError_t cudaErr_ = (error)) /* once; cudaSuccess == 0 */     \
+    {                                                                          \
+      printf("CUDA Error %d / %s (%s: %d)\n", cudaErr_,                        \
+             cudaGetErrorString(cudaErr_), __FILE__, __LINE__);                \
+      exit(1);                                                                 \
+    }                                                                          \
+  }
+
+#ifdef DEBUG_GPU // event-based timing helpers, only compiled with DEBUG_GPU
+#define printCudaDebugStart() /* declares time/start/stop in caller scope */   \
+  float time;                                                                  \
+  time = 0.;                                                                   \
+  cudaEvent_t start, stop;                                                     \
+  checkCudaErrors(cudaEventCreate(&start));                                    \
+  checkCudaErrors(cudaEventCreate(&stop));                                     \
+  checkCudaErrors(cudaEventRecord(start, 0)); /* open the first interval */
+#define printCudaDebug(msg) /* uses the names from printCudaDebugStart() */    \
+  checkCudaErrors(cudaEventRecord(stop, 0));                                   \
+  checkCudaErrors(cudaEventSynchronize(stop)); /* wait for stop to occur */    \
+  checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); /* in ms */       \
+  printf("\t\t\tGPU: %s %1.6f sec\n", msg, time / 1000);                       \
+  checkCudaErrors(cudaEventRecord(start, 0)); /* reopen for next interval */
+
+#else // DEBUG_GPU not defined: both macros expand to nothing
+#define printCudaDebugStart()
+#define printCudaDebug(msg)
+#endif
 
 static const char *cufftGetErrorStrung(cufftResult error)
 {
-    switch (error)
-    {
-        case CUFFT_SUCCESS:
-            return "CUFFT_SUCCESS";
+  switch (error) // each handled cuFFT status maps to its enumerator name
+  {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
 
-        case CUFFT_INVALID_PLAN:
-            return "CUFFT_INVALID_PLAN";
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
 
-        case CUFFT_ALLOC_FAILED:
-            return "CUFFT_ALLOC_FAILED";
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
 
-        case CUFFT_INVALID_TYPE:
-            return "CUFFT_INVALID_TYPE";
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
 
-        case CUFFT_INVALID_VALUE:
-            return "CUFFT_INVALID_VALUE";
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
 
-        case CUFFT_INTERNAL_ERROR:
-            return "CUFFT_INTERNAL_ERROR";
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
 
-        case CUFFT_EXEC_FAILED:
-            return "CUFFT_EXEC_FAILED";
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
 
-        case CUFFT_SETUP_FAILED:
-            return "CUFFT_SETUP_FAILED";
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
 
-        case CUFFT_INVALID_SIZE:
-            return "CUFFT_INVALID_SIZE";
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
 
-        case CUFFT_UNALIGNED_DATA:
-            return "CUFFT_UNALIGNED_DATA";
-    }
-    return "UNKNOWN";
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+  }
+  return "UNKNOWN"; // any status value not listed in the switch above
 }
 
 /* Handing CUDA Driver errors */
 
-#define cuErrorCheck(call) \
-  do { \
-    CUresult __error__; \
-    if ((__error__ = (call)) != CUDA_SUCCESS) { \
-      printf("CUDA Driver Error %d / %s (%s %d)\n", __error__, cuGetError(__error__),__FILE__, __LINE__); \
-      return __error__; \
-    } \
+#define cuErrorCheck(call) /* check a CUDA driver API call, once */            \
+  do /* do/while(false): expands to a single statement */                      \
+  {                                                                            \
+    CUresult __error__; /* holds the one evaluation of (call) */               \
+    if ((__error__ = (call)) != CUDA_SUCCESS)                                  \
+    {                                                                          \
+      printf("CUDA Driver Error %d / %s (%s %d)\n", __error__,                 \
+             cuGetError(__error__), __FILE__, __LINE__);                       \
+      return __error__; /* returns from the function using the macro */        \
+    }                                                                          \
   } while (false)
 
-static const char * cuGetError(CUresult result) {
-  switch (result) {
-    case CUDA_SUCCESS:                              return "No errors";
-    case CUDA_ERROR_INVALID_VALUE:                  return "Invalid value";
-    case CUDA_ERROR_OUT_OF_MEMORY:                  return "Out of memory";
-    case CUDA_ERROR_NOT_INITIALIZED:                return "Driver not initialized";
-    case CUDA_ERROR_DEINITIALIZED:                  return "Driver deinitialized";
-    case CUDA_ERROR_PROFILER_DISABLED:              return "Profiler disabled";
-    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:       return "Profiler not initialized";
-    case CUDA_ERROR_PROFILER_ALREADY_STARTED:       return "Profiler already started";
-    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:       return "Profiler already stopped";
-    case CUDA_ERROR_NO_DEVICE:                      return "No CUDA-capable device available";
-    case CUDA_ERROR_INVALID_DEVICE:                 return "Invalid device";
-    case CUDA_ERROR_INVALID_IMAGE:                  return "Invalid kernel image";
-    case CUDA_ERROR_INVALID_CONTEXT:                return "Invalid context";
-    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:        return "Context already current";
-    case CUDA_ERROR_MAP_FAILED:                     return "Map failed";
-    case CUDA_ERROR_UNMAP_FAILED:                   return "Unmap failed";
-    case CUDA_ERROR_ARRAY_IS_MAPPED:                return "Array is mapped";
-    case CUDA_ERROR_ALREADY_MAPPED:                 return "Already mapped";
-    case CUDA_ERROR_NO_BINARY_FOR_GPU:              return "No binary for GPU";
-    case CUDA_ERROR_ALREADY_ACQUIRED:               return "Already acquired";
-    case CUDA_ERROR_NOT_MAPPED:                     return "Not mapped";
-    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:            return "Not mapped as array";
-    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:          return "Not mapped as pointer";
-    case CUDA_ERROR_ECC_UNCORRECTABLE:              return "Uncorrectable ECC error";
-    case CUDA_ERROR_UNSUPPORTED_LIMIT:              return "Unsupported CUlimit";
-    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:         return "Context already in use";
-    case CUDA_ERROR_INVALID_SOURCE:                 return "Invalid source";
-    case CUDA_ERROR_FILE_NOT_FOUND:                 return "File not found";
-    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Shared object symbol not found";
-    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:      return "Shared object initialization failed";
-    case CUDA_ERROR_OPERATING_SYSTEM:               return "Operating System call failed";
-    case CUDA_ERROR_INVALID_HANDLE:                 return "Invalid handle";
-    case CUDA_ERROR_NOT_FOUND:                      return "Not found";
-    case CUDA_ERROR_NOT_READY:                      return "CUDA not ready";
-    case CUDA_ERROR_LAUNCH_FAILED:                  return "Launch failed";
-    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:        return "Launch exceeded resources";
-    case CUDA_ERROR_LAUNCH_TIMEOUT:                 return "Launch exceeded timeout";
-    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  return "Launch with incompatible texturing";
-    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:    return "Peer access already enabled";
-    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:        return "Peer access not enabled";
-    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:         return "Primary context active";
-    case CUDA_ERROR_CONTEXT_IS_DESTROYED:           return "Context is destroyed";
-    case CUDA_ERROR_ASSERT:                         return "Device assert failed";
-    case CUDA_ERROR_TOO_MANY_PEERS:                 return "Too many peers";
-    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "Host memory already registered";
-    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:     return "Host memory not registered";
-    case CUDA_ERROR_UNKNOWN:                        return "Unknown error";
-    default:                                        return "Unknown error code";
+static const char *cuGetError(CUresult result)
+{ // Returns a short description string for a CUDA driver API result code.
+  switch (result)
+  {
+    case CUDA_SUCCESS:
+      return "No errors";
+    case CUDA_ERROR_INVALID_VALUE:
+      return "Invalid value";
+    case CUDA_ERROR_OUT_OF_MEMORY:
+      return "Out of memory";
+    case CUDA_ERROR_NOT_INITIALIZED:
+      return "Driver not initialized";
+    case CUDA_ERROR_DEINITIALIZED:
+      return "Driver deinitialized";
+    case CUDA_ERROR_PROFILER_DISABLED:
+      return "Profiler disabled";
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+      return "Profiler not initialized";
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+      return "Profiler already started";
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+      return "Profiler already stopped";
+    case CUDA_ERROR_NO_DEVICE:
+      return "No CUDA-capable device available";
+    case CUDA_ERROR_INVALID_DEVICE:
+      return "Invalid device";
+    case CUDA_ERROR_INVALID_IMAGE:
+      return "Invalid kernel image";
+    case CUDA_ERROR_INVALID_CONTEXT:
+      return "Invalid context";
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+      return "Context already current";
+    case CUDA_ERROR_MAP_FAILED:
+      return "Map failed";
+    case CUDA_ERROR_UNMAP_FAILED:
+      return "Unmap failed";
+    case CUDA_ERROR_ARRAY_IS_MAPPED:
+      return "Array is mapped";
+    case CUDA_ERROR_ALREADY_MAPPED:
+      return "Already mapped";
+    case CUDA_ERROR_NO_BINARY_FOR_GPU:
+      return "No binary for GPU";
+    case CUDA_ERROR_ALREADY_ACQUIRED:
+      return "Already acquired";
+    case CUDA_ERROR_NOT_MAPPED:
+      return "Not mapped";
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+      return "Not mapped as array";
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+      return "Not mapped as pointer";
+    case CUDA_ERROR_ECC_UNCORRECTABLE:
+      return "Uncorrectable ECC error";
+    case CUDA_ERROR_UNSUPPORTED_LIMIT:
+      return "Unsupported CUlimit";
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+      return "Context already in use";
+    case CUDA_ERROR_INVALID_SOURCE:
+      return "Invalid source";
+    case CUDA_ERROR_FILE_NOT_FOUND:
+      return "File not found";
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+      return "Shared object symbol not found";
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+      return "Shared object initialization failed";
+    case CUDA_ERROR_OPERATING_SYSTEM:
+      return "Operating System call failed";
+    case CUDA_ERROR_INVALID_HANDLE:
+      return "Invalid handle";
+    case CUDA_ERROR_NOT_FOUND:
+      return "Not found";
+    case CUDA_ERROR_NOT_READY:
+      return "CUDA not ready";
+    case CUDA_ERROR_LAUNCH_FAILED:
+      return "Launch failed";
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+      return "Launch exceeded resources";
+    case CUDA_ERROR_LAUNCH_TIMEOUT:
+      return "Launch exceeded timeout";
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+      return "Launch with incompatible texturing";
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+      return "Peer access already enabled";
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+      return "Peer access not enabled";
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+      return "Primary context active";
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+      return "Context is destroyed";
+    case CUDA_ERROR_ASSERT:
+      return "Device assert failed";
+    case CUDA_ERROR_TOO_MANY_PEERS:
+      return "Too many peers";
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+      return "Host memory already registered";
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+      return "Host memory not registered";
+    case CUDA_ERROR_UNKNOWN:
+      return "Unknown error";
+    default: // any code without a case above
+      return "Unknown error code";
   }
 }
 
 bioem_cuda::bioem_cuda()
 {
-	deviceInitialized = 0;
-	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
-	GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
-	GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
-	if (GPUWorkload == -1) GPUWorkload = 100;
-	GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
+  deviceInitialized = 0; // settings below come from environment variables
+  GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
+  GPUWorkload = // GPUWORKLOAD, parsed with atoi(); 100 when the variable is unset
+      getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
+  if (GPUWorkload == -1) // an explicit -1 is mapped to the same default of 100
+    GPUWorkload = 100;
+  GPUDualStream = // GPUDUALSTREAM, parsed with atoi(); 1 when the variable is unset
+      getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
 }
 
-bioem_cuda::~bioem_cuda()
-{
-	deviceExit();
-}
+bioem_cuda::~bioem_cuda() { deviceExit(); } // destructor only calls deviceExit()
 
-__global__ void compareRefMap_kernel(const int iOrient, const int iConv,  const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC,
-                                                const myfloat_t sumsquareC, const myfloat_t* pMap, bioem_Probability pProb, 
-						const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int cent_x, const int cent_y, const int maxRef)
+__global__ void multComplexMap(const mycomplex_t *convmap,
+                               const mycomplex_t *refmap, mycuComplex_t *out,
+                               const int MapSize, const int maxParallelConv, // maxParallelConv is not read in the body
+                               const int NumberRefMaps, const int Offset)
 {
-	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
-	if (iRefMap < maxRef)
-	{
-		compareRefMap<0>(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, pMap, pProb, param, RefMap, cent_x, cent_y);
-	}
+  int myConv = myBlockIdxX / NumberRefMaps; // block index -> convolution map slot
+  int myRef = myBlockIdxX - myConv * NumberRefMaps + Offset; // remainder, shifted by Offset
+  const mycuComplex_t *myin = (mycuComplex_t *) &refmap[myRef * MapSize];
+  const mycuComplex_t *myconv = (mycuComplex_t *) &convmap[myConv * MapSize];
+  mycuComplex_t *myout = &out[myBlockIdxX * MapSize]; // one output map of MapSize entries per block
+  for (int i = myThreadIdxX; i < MapSize; i += myBlockDimX) // threads step through the map by block size
+  {
+    mycuComplex_t val;
+    const mycuComplex_t conv = myconv[i];
+    const mycuComplex_t in = myin[i];
+
+    val.x = conv.x * in.x + conv.y * in.y; // Re(conv * conj(in)) if .x/.y are re/im - TODO confirm mycuComplex_t
+    val.y = conv.y * in.x - conv.x * in.y; // Im(conv * conj(in)) under the same assumption
+    myout[i] = val;
+  }
 }
 
-__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC, const myfloat_t sumsquareC, const myfloat_t* pMap, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int maxRef)
+__global__ void // each thread handles one reference map of the current batch
+cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t *lCC,
+               const myparam5_t *comp_params, bioem_Probability pProb,
+               const bioem_param_device param, const bioem_RefMap RefMap,
+               const int maxRef, const int Offset)
 {
-	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
-	if (iRefMap < maxRef)
-	{
-		compareRefMapShifted<1>(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, pMap, pProb, param, RefMap);
-	}
+  if (myBlockIdxX * myBlockDimX + myThreadIdxX >= maxRef)
+    return; // threads whose batch-local index is past maxRef do nothing
+  const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset; // batch-local index plus Offset
+  const myfloat_t *mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * // lCC uses the batch-local index,
+                                param.NumberPixels * param.NumberPixels];    // NumberPixels^2 entries per map
+  doRefMapFFT(iRefMap, iOrient, iConv, comp_params->amp, comp_params->pha,
+              comp_params->env, comp_params->sumC, comp_params->sumsquareC,
+              mylCC, pProb, param, RefMap);
 }
 
-__global__ void cudaZeroMem(void* ptr, size_t size)
+__global__ void // per-block partial reduction; thread 0 writes comp_block[block]
+doRefMap_GPU_Parallel(const int iRefMap, const int iOrient, const int iConv,
+                      const int maxParallelConv, const myfloat_t *lCC,
+                      const myparam5_t *comp_params, myblockGPU_t *comp_block,
+                      bioem_Probability pProb, const bioem_param_device param,
+                      const bioem_RefMap RefMap, const int maxRef,
+                      const int dispC)
 {
-	int* myptr = (int*) ptr;
-	int mysize = size / sizeof(int);
-	int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
-	int mygrid = myBlockDimX * myGridDimX;
-	for (int i = myid; i < mysize; i += mygrid) myptr[i] = 0;
-}
+  int myGlobalId = myBlockIdxX * myBlockDimX + myThreadIdxX; // flat id over maxParallelConv * NtotDisp items
+  if (myGlobalId >= maxParallelConv * param.NtotDisp)
+    return; // NOTE(review): exits before the __syncthreads() calls below - confirm this is safe on the target GPUs
+  int myConv = myGlobalId / param.NtotDisp; // split the flat id into (myConv, myX, myY)
+  myGlobalId -= myConv * param.NtotDisp;
+  int myX = myGlobalId / param.NxDisp;
+  myGlobalId -= myX * param.NxDisp;
+  int myY = myGlobalId;
+  myGlobalId = myBlockIdxX * myBlockDimX + myThreadIdxX; // restore the flat id consumed above
+
+  int cent_x = (myX * param.GridSpaceCenter + dispC) % param.NumberPixels; // grid step plus dispC, modulo NumberPixels
+  int cent_y = (myY * param.GridSpaceCenter + dispC) % param.NumberPixels;
+  int address = (myConv * maxRef * param.NumberPixels * param.NumberPixels) + // NOTE(review): int arithmetic - confirm no overflow
+                (cent_x * param.NumberPixels + cent_y);
+  myfloat_t value = (myfloat_t) lCC[address] / // lCC entry divided by NumberPixels^2
+                    (myfloat_t)(param.NumberPixels * param.NumberPixels);
+
+  __shared__ myprob_t bestLogpro[CUDA_THREAD_MAX]; // indexed by myThreadIdxX; assumes block size <= CUDA_THREAD_MAX - TODO confirm
+  __shared__ int bestId[CUDA_THREAD_MAX];
+  __shared__ myprob_t sumExp[CUDA_THREAD_MAX];
+  __shared__ myprob_t sumAngles[CUDA_THREAD_MAX]; // only written and read when param.writeAngles is set
+
+  int nTotalThreads = // number of in-range work items in this block (smaller for a partial last block)
+      ((maxParallelConv * param.NtotDisp) < ((myBlockIdxX + 1) * myBlockDimX)) ?
+          ((maxParallelConv * param.NtotDisp) - (myBlockIdxX * myBlockDimX)) :
+          myBlockDimX;
+  int halfPoint = (nTotalThreads + 1) >> 1; // half of nTotalThreads, rounded up
+
+  bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
+
+  bestLogpro[myThreadIdxX] =
+      calc_logpro(param, comp_params[myConv].amp, comp_params[myConv].pha,
+                  comp_params[myConv].env, comp_params[myConv].sumC,
+                  comp_params[myConv].sumsquareC, value,
+                  RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
+#ifdef DEBUG_PROB
+  printf("\t\t\tProb: iRefMap %d, iOrient %d, iConv %d, "
+         "cent_x %d, cent_y %d, address %d, value %f, logpro %f\n",
+         iRefMap, iOrient, iConv, cent_x, cent_y, address, value,
+         bestLogpro[myThreadIdxX]);
+#endif
+  bestId[myThreadIdxX] = myGlobalId; // remember which work item produced this logpro
+  sumExp[myThreadIdxX] = exp(bestLogpro[myThreadIdxX] - pProbMap.Constoadd);
+  if (param.writeAngles)
+  {
+    bioem_Probability_angle &pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
+    sumAngles[myThreadIdxX] =
+        exp(bestLogpro[myThreadIdxX] - pProbAngle.ConstAngle);
+  }
+  __syncthreads();
 
-__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t sumC, const myfloat_t sumsquareC, const myfloat_t* pMap, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
-{
-	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
-	const int iRefMap = myid >> (nShiftBits << 1);
-	const int myRef = myThreadIdxX >> (nShiftBits << 1);
-	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
-	const int myShiftIdy = myid & (nShifts - 1);
-	const int myShift = myid & (nShifts * nShifts - 1);
-	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
-	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
-
-	const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
-
-	compareRefMap<2>(iRefMap, iOrient, iConv, amp, pha, env, sumC, sumsquareC, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
+  // Pairwise reduction in shared memory: max of logpro (with its id) and sums
+  while (nTotalThreads > 1)
+  {
+    if (myThreadIdxX < (nTotalThreads >> 1)) // for an odd count the middle entry is left as is this round
+    {
+      // Get the shared value stored by another thread
+      myprob_t temp = bestLogpro[myThreadIdxX + halfPoint];
+      if (temp > bestLogpro[myThreadIdxX]) // strict '>': on a tie the lower-index entry is kept
+      {
+        bestLogpro[myThreadIdxX] = temp;
+        bestId[myThreadIdxX] = bestId[myThreadIdxX + halfPoint];
+      }
+      sumExp[myThreadIdxX] += sumExp[myThreadIdxX + halfPoint];
+      if (param.writeAngles)
+      {
+        sumAngles[myThreadIdxX] += sumAngles[myThreadIdxX + halfPoint];
+      }
+    }
+    __syncthreads();
+    nTotalThreads = halfPoint;            // entries still to be combined
+    halfPoint = (nTotalThreads + 1) >> 1; // half of that, rounded up
+    // only the first half of the threads will be active.
+  }
+  if (myThreadIdxX == 0) // slot 0 now holds the block-wide result
+  {
+    comp_block[myBlockIdxX].logpro = bestLogpro[0];
+    comp_block[myBlockIdxX].id = bestId[0];
+    comp_block[myBlockIdxX].sumExp = sumExp[0];
+    if (param.writeAngles)
+    {
+      comp_block[myBlockIdxX].sumAngles = sumAngles[0];
+    }
+#ifdef DEBUG_PROB
+    printf("\t\t\tProb block: iRefMap %d, iOrient %d, iConv %d, "
+           "bestlogpro %f, bestId %d, sumExp %f\n",
+           iRefMap, iOrient, iConv, bestLogpro[0], bestId[0], sumExp[0]);
+#endif
+  }
 }
 
-__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
+__global__ void // folds the partial results in comp_block into pProb
+doRefMap_GPU_Reduce(const int iRefMap, const int iOrient, const int iConv,
+                    const int maxParallelConv, const myfloat_t *lCC,
+                    const myparam5_t *comp_params,
+                    const myblockGPU_t *comp_block, bioem_Probability pProb,
+                    const bioem_param_device param, const bioem_RefMap RefMap,
+                    const int maxRef, const int dispC)
 {
-	if (myBlockIdxX >= NumberMaps) return;
-	const mycuComplex_t* myin = (mycuComplex_t*) &refmap[(myBlockIdxX + Offset) * MapSize];
-	const mycuComplex_t* myconv = (mycuComplex_t*) convmap;
-	mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
-	for(int i = myThreadIdxX; i < NumberPixelsTotal; i += myBlockDimX)
-	{
-		mycuComplex_t val;
-		const mycuComplex_t conv = myconv[i];
-		const mycuComplex_t in = myin[i];
-
-		val.x = conv.x * in.x + conv.y * in.y;
-		val.y = conv.y * in.x - conv.x * in.y;
-		myout[i] = val;
-	}
+  // Per-thread scratch: best logpro with its id, plus the two running sums.
+  __shared__ myprob_t bestLogpro[CUDA_THREAD_MAX];
+  __shared__ int bestId[CUDA_THREAD_MAX];
+  __shared__ myprob_t sumExp[CUDA_THREAD_MAX];
+  __shared__ myprob_t sumAngles[CUDA_THREAD_MAX];
+
+  // NOTE(review): indices go up to myBlockDimX - 1, so the launch must use at
+  // most CUDA_THREAD_MAX threads - confirm at the launch site.
+  int nTotalThreads = myBlockDimX;
+  int halfPoint = (nTotalThreads + 1) >> 1; // half, rounded up
+
+  bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
+  // Thread i loads the partial result stored in comp_block[i].
+  bestLogpro[myThreadIdxX] = comp_block[myThreadIdxX].logpro;
+  bestId[myThreadIdxX] = comp_block[myThreadIdxX].id;
+  sumExp[myThreadIdxX] = comp_block[myThreadIdxX].sumExp;
+  if (param.writeAngles)
+  {
+    sumAngles[myThreadIdxX] = comp_block[myThreadIdxX].sumAngles;
+  }
+  __syncthreads();
+  while (nTotalThreads > 1)
+  {
+    if (myThreadIdxX < (nTotalThreads >> 1))
+    {
+      // Candidate from the upper half; kept in myprob_t so it is not narrowed
+      myprob_t temp = bestLogpro[myThreadIdxX + halfPoint];
+      if (temp > bestLogpro[myThreadIdxX])
+      {
+        bestLogpro[myThreadIdxX] = temp;
+        bestId[myThreadIdxX] = bestId[myThreadIdxX + halfPoint];
+      }
+      sumExp[myThreadIdxX] += sumExp[myThreadIdxX + halfPoint];
+      if (param.writeAngles)
+      {
+        sumAngles[myThreadIdxX] += sumAngles[myThreadIdxX + halfPoint];
+      }
+    }
+    __syncthreads();
+    nTotalThreads = halfPoint;            // entries still to be folded
+    halfPoint = (nTotalThreads + 1) >> 1; // half, rounded up
+    // only the first half of the threads will be active.
+  }
+  // Entry 0 now holds the folded result; thread 0 alone updates pProb.
+  if (myThreadIdxX == 0)
+  {
+    pProbMap.Total += sumExp[0];
+    if (pProbMap.Constoadd < bestLogpro[0])
+    { // larger logpro found: rescale Total and move Constoadd to it
+      pProbMap.Total *= exp(-bestLogpro[0] + pProbMap.Constoadd);
+      pProbMap.Constoadd = bestLogpro[0];
+
+      // ********** Getting parameters that maximize the probability ***********
+      int myGlobalId = bestId[0]; // decoded below as (myConv, myX, myY)
+      int myConv = myGlobalId / param.NtotDisp;
+      myGlobalId -= myConv * param.NtotDisp;
+      int myX = myGlobalId / param.NxDisp;
+      myGlobalId -= myX * param.NxDisp;
+      int myY = myGlobalId;
+
+      int cent_x = (myX * param.GridSpaceCenter + dispC) % param.NumberPixels;
+      int cent_y = (myY * param.GridSpaceCenter + dispC) % param.NumberPixels;
+      int address =
+          (myConv * maxRef * param.NumberPixels * param.NumberPixels) +
+          (cent_x * param.NumberPixels + cent_y);
+      myfloat_t value = (myfloat_t) lCC[address] /
+                        (myfloat_t)(param.NumberPixels * param.NumberPixels);
+
+      pProbMap.max.max_prob_cent_x =
+          -((myX * param.GridSpaceCenter + dispC) - param.NumberPixels);
+      pProbMap.max.max_prob_cent_y =
+          -((myY * param.GridSpaceCenter + dispC) - param.NumberPixels);
+      pProbMap.max.max_prob_orient = iOrient;
+      pProbMap.max.max_prob_conv = iConv + myConv;
+      pProbMap.max.max_prob_norm =
+          -(-comp_params[myConv].sumC * RefMap.sum_RefMap[iRefMap] +
+            param.Ntotpi * value) /
+          (comp_params[myConv].sumC * comp_params[myConv].sumC -
+           comp_params[myConv].sumsquareC * param.Ntotpi);
+      pProbMap.max.max_prob_mu =
+          -(-comp_params[myConv].sumC * value +
+            comp_params[myConv].sumsquareC * RefMap.sum_RefMap[iRefMap]) /
+          (comp_params[myConv].sumC * comp_params[myConv].sumC -
+           comp_params[myConv].sumsquareC * param.Ntotpi);
+
+#ifdef DEBUG_PROB
+      printf("\tProbabilities change: iRefMap %d, iOrient %d, iConv %d, "
+             "Total %f, Const %f, bestlogpro %f, sumExp %f, bestId %d\n",
+             iRefMap, iOrient, iConv + myConv, pProbMap.Total,
+             pProbMap.Constoadd, bestLogpro[0], sumExp[0], bestId[0]);
+      printf("\tParameters: myConv %d, myX %d, myY %d, cent_x %d, cent_y %d, "
+             "probX %d, probY %d\n",
+             myConv, myX, myY, cent_x, cent_y, pProbMap.max.max_prob_cent_x,
+             pProbMap.max.max_prob_cent_y);
+#endif
+    }
+#ifdef DEBUG_PROB
+    printf("\t\tProbabilities after Reduce: iRefMap %d, iOrient %d, iConv "
+           "%d, Total %f, Const %f, bestlogpro %f, sumExp %f, bestId %d\n",
+           iRefMap, iOrient, iConv, pProbMap.Total, pProbMap.Constoadd,
+           bestLogpro[0], sumExp[0], bestId[0]);
+#endif
+    // Same accumulate-and-rescale step for the per-orientation angle entry.
+    if (param.writeAngles)
+    {
+      bioem_Probability_angle &pProbAngle =
+          pProb.getProbAngle(iRefMap, iOrient);
+      pProbAngle.forAngles += sumAngles[0];
+      if (pProbAngle.ConstAngle < bestLogpro[0])
+      {
+        pProbAngle.forAngles *= exp(-bestLogpro[0] + pProbAngle.ConstAngle);
+        pProbAngle.ConstAngle = bestLogpro[0];
+      }
+    }
+  }
 }
 
-__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t amp, const myfloat_t pha, const myfloat_t env, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset)
+__global__ void // stores one logpro value as Constoadd / ConstAngle of iRefMap
+init_Constoadd(const int iRefMap, const int iOrient, const myfloat_t *lCC,
+               const myparam5_t *comp_params, bioem_Probability pProb,
+               const bioem_param_device param, const bioem_RefMap RefMap,
+               const int initialized_const) // nonzero: Constoadd is left as is
 {
-	if (myBlockIdxX * myBlockDimX + myThreadIdxX >= maxRef) return;
-	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
-	const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
-	doRefMapFFT(iRefMap, iOrient, iConv, amp, pha, env, mylCC, sumC, sumsquareC, pProb, param, RefMap);
+  myfloat_t value = // first lCC entry divided by NumberPixels squared
+      (myfloat_t) lCC[0] / (myfloat_t)(param.NumberPixels * param.NumberPixels);
+
+  myfloat_t logpro = // evaluated with the parameter set comp_params points at
+      calc_logpro(param, comp_params->amp, comp_params->pha, comp_params->env,
+                  comp_params->sumC, comp_params->sumsquareC, value,
+                  RefMap.sum_RefMap[iRefMap], RefMap.sumsquare_RefMap[iRefMap]);
+
+  bioem_Probability_map &pProbMap = pProb.getProbMap(iRefMap);
+
+  // Needed only once, in the first projection
+  if (!initialized_const)
+  {
+    pProbMap.Constoadd = logpro;
+  }
+  // Needed for every projection
+  if (param.writeAngles)
+  {
+    bioem_Probability_angle &pProbAngle = pProb.getProbAngle(iRefMap, iOrient);
+    pProbAngle.ConstAngle = logpro;
+  }
+
+#ifdef DEBUG_GPU
+  printf("\tInitialized pProbMap.Constoadd of refmap %d to %f\n", iRefMap,
+         pProbMap.Constoadd);
+#endif
 }
 
-template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
-static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
-#if defined(_WIN32)
-static inline int ilog2 (int value)
+template <class T> static inline T divup(T num, T divider) // rounded-up quotient
 {
-	DWORD index;
-	_BitScanReverse (&index, value);
-	return(value);
+  return ((num + divider - 1) / divider); // assumes positive integral operands
 }
-#else
-static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
-#endif
 
-int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
+int bioem_cuda::compareRefMaps(int iPipeline, int iOrient, int iConv, // GPU path
+                               int maxParallelConv, mycomplex_t *conv_mapsFFT,
+                               myparam5_t *comp_params, const int startMap)
 {
-	if (startMap)
-	{
-		cout << "Error startMap not implemented for GPU Code\n";
-		exit(1);
-	}
-#ifdef DEBUG_GPU
-	float time;
-	cudaEvent_t start, stop;
-	checkCudaErrors(cudaEventCreate(&start));
-	checkCudaErrors(cudaEventCreate(&stop));
-	checkCudaErrors(cudaEventRecord(start, 0));
-#endif
-	if (GPUAsync)
-	{
-		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
-	}
-#ifdef DEBUG_GPU
-	checkCudaErrors(cudaEventRecord(stop, 0));
-	checkCudaErrors(cudaEventSynchronize(stop));
-	checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
-	printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
-	checkCudaErrors(cudaEventRecord(start, 0));
-#endif
-	if (FFTAlgo)
-	{
-		memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
-		checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
-#ifdef DEBUG_GPU
-		checkCudaErrors(cudaEventRecord(stop, 0));
-		checkCudaErrors(cudaEventSynchronize(stop));
-		checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
-		printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
-		checkCudaErrors(cudaEventRecord(start, 0));
-#endif
-		if (GPUAsync)
-		{
-			checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
-			checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
-		}
-		if (GPUDualStream)
-		{
-			checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
-			checkCudaErrors(cudaStreamWaitEvent(cudaStream[1], cudaFFTEvent[0], 0));
-		}
-		for (int i = 0, j = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE, j++)
-		{
-			if (!GPUDualStream) j = 0;
-			const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
-			multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2[j & 1], param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
-			cufftResult err = mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1][j & 1] : plan[0][j & 1], pFFTtmp2[j & 1], pFFTtmp[j & 1]);
-			if (err != CUFFT_SUCCESS)
-			{
-				cout << "Error running CUFFT " << cufftGetErrorStrung(err) << "\n";
-				exit(1);
-			}
-			cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(iOrient, iConv,  amp, pha, env, pFFTtmp[j & 1], sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
-		}
-		checkCudaErrors(cudaPeekAtLastError());
-		if (GPUDualStream)
-		{
-			checkCudaErrors(cudaEventRecord(cudaFFTEvent[1], cudaStream[1]));
-			checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaFFTEvent[1], 0));
-		}
-	}
-	else
-	{
-		checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
-#ifdef DEBUG_GPU
-		checkCudaErrors(cudaEventRecord(stop, 0));
-		checkCudaErrors(cudaEventSynchronize(stop));
-		checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
-		printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
-		checkCudaErrors(cudaEventRecord(start, 0) );
-#endif
-		if (GPUAlgo == 2) //Loop over shifts
-		{
-			const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
-			if (!IsPowerOf2(nShifts))
-			{
-				cout << "Invalid number of displacements, no power of two\n";
-				exit(1);
-			}
-			if (CUDA_THREAD_COUNT % (nShifts * nShifts))
-			{
-				cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
-				exit(1);
-			}
-			if (nShifts > CUDA_MAX_SHIFT_REDUCE)
-			{
-				cout << "Too many displacements for CUDA reduction\n";
-				exit(1);
-			}
-			const int nShiftBits = ilog2(nShifts);
-			size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
-			size_t nBlocks = CUDA_BLOCK_COUNT;
-			for (size_t i = 0; i < totalBlocks; i += nBlocks)
-			{
-				compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream[0] >>> (iOrient, iConv, amp, pha, env, sumC, sumsquareC, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
-			}
-		}
-		else if (GPUAlgo == 1) //Split shifts in multiple kernels
-		{
-			for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x = cent_x + param.param_device.GridSpaceCenter)
-			{
-				for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y = cent_y + param.param_device.GridSpaceCenter)
-				{
-					compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>> (iOrient, iConv, amp, pha, env, sumC, sumsquareC, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, cent_x, cent_y, maxRef);
-				}
-			}
-		}
-		else if (GPUAlgo == 0) //All shifts in one kernel
-		{
-			compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[0]>>> (iOrient, iConv, amp, pha, env, sumC, sumsquareC, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, maxRef);
-		}
-		else
-		{
-			cout << "Invalid GPU Algorithm selected\n";
-			exit(1);
-		}
-	}
-#ifdef DEBUG_GPU
-	checkCudaErrors(cudaEventRecord(stop, 0));
-	checkCudaErrors(cudaEventSynchronize(stop));
-	checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
-	printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
-	checkCudaErrors(cudaEventRecord(start, 0));
-#endif
-	if (GPUWorkload < 100)
-	{
-		bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
-	}
-#ifdef DEBUG_GPU
-	checkCudaErrors(cudaEventRecord(stop, 0));
-	checkCudaErrors(cudaEventSynchronize(stop));
-	checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
-	printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
-#endif
-	if (GPUAsync)
-	{
-		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
-	}
-	else
-	{
-		checkCudaErrors(cudaStreamSynchronize(cudaStream[0]));
-	}
-	return(0);
+  if (startMap)
+  {
+    cout << "Error startMap not implemented for GPU Code\n";
+    exit(1);
+  }
+  printCudaDebugStart();
+  if (GPUAsync)
+  { // wait for the event recorded at the end of the previous call on this slot
+    checkCudaErrors(cudaEventSynchronize(cudaEvent[iPipeline & 1]));
+    printCudaDebug("time to synch projections");
+  }
+  // k is the base index of this call's slots: 0 or param.nTotParallelConv.
+  int k = (iPipeline & 1) * param.nTotParallelConv;
+  memcpy(&pConvMapFFT_Host[k * param.FFTMapSize],
+         conv_mapsFFT[k * param.FFTMapSize],
+         param.FFTMapSize * maxParallelConv * sizeof(mycomplex_t));
+  printCudaDebug("time for memcpy");
+  checkCudaErrors(
+      cudaMemcpyAsync(&pConvMapFFT[k * param.FFTMapSize],
+                      &pConvMapFFT_Host[k * param.FFTMapSize],
+                      param.FFTMapSize * maxParallelConv * sizeof(mycomplex_t),
+                      cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
+  // If one wants just a single transfer, without memcpy:
+  // checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[k * param.FFTMapSize],
+  // conv_mapsFFT[k * param.FFTMapSize], param.FFTMapSize * maxParallelConv *
+  // sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 :
+  // 0]));
+  checkCudaErrors(cudaMemcpyAsync(&pTmp_comp_params[k], &comp_params[k],
+                                  maxParallelConv * sizeof(myparam5_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream[GPUAsync ? 2 : 0]));
+  printCudaDebug("time for asyncmemcpy");
+  if (GPUAsync)
+  { // stream 0 must not start before the uploads queued on stream 2 are done
+    checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
+    checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaEvent[2], 0));
+  }
+  if (GPUDualStream)
+  { // stream 1 waits for everything queued on stream 0 up to this point
+    checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
+    checkCudaErrors(cudaStreamWaitEvent(cudaStream[1], cudaFFTEvent[0], 0));
+  }
+  for (int offset = 0, stream = 0; offset < maxRef;
+       offset += param.nTotParallelMaps, stream++)
+  { // one batch of at most param.nTotParallelMaps reference maps per pass
+    if (!GPUDualStream)
+      stream = 0; // single-stream mode: every batch goes to cudaStream[0]
+    const int nRef = min(param.nTotParallelMaps, maxRef - offset);
+    multComplexMap<<<maxParallelConv * nRef, CudaThreadCount, 0,
+                     cudaStream[stream & 1]>>>(
+        &pConvMapFFT[k * param.FFTMapSize], pRefMapsFFT, pFFTtmp2[stream & 1],
+        param.FFTMapSize, maxParallelConv, nRef, offset);
+    printCudaDebug("time for multComplexMap kernel");
+    cufftResult err = mycufftExecC2R(offset + param.nTotParallelMaps > maxRef ? // short last batch: plan[1]
+                                         plan[1][stream & 1] :
+                                         plan[0][stream & 1],
+                                     pFFTtmp2[stream & 1], pFFTtmp[stream & 1]);
+    if (err != CUFFT_SUCCESS)
+    {
+      cout << "Error running CUFFT " << cufftGetErrorStrung(err) << "\n";
+      exit(1);
+    }
+    printCudaDebug("time for mycufftExecC2R kernel");
+    if (BioEMAlgo == 1)
+    { // one cuDoRefMapsFFT launch per convolution, covering the nRef maps
+      for (int conv = 0; conv < maxParallelConv; conv++)
+      {
+        cuDoRefMapsFFT<<<divup(nRef, CudaThreadCount), CudaThreadCount, 0,
+                         cudaStream[stream & 1]>>>(
+            iOrient, iConv + conv,
+            pFFTtmp[stream & 1] +
+                conv * nRef * param.param_device.NumberPixels *
+                    param.param_device.NumberPixels,
+            &pTmp_comp_params[k + conv], pProb_device, param.param_device,
+            *gpumap, nRef, offset);
+        printCudaDebug("time for cuDoRefMapsFFT kernel");
+      }
+    }
+    else
+    { // per reference map: optional init, then a Parallel + Reduce kernel pair
+      for (int refmap = offset; refmap < nRef + offset; refmap++)
+      {
+        // First iteration needs to initialize Constoadd with the first valid
+        // value to avoid overflow due to high sumExp values
+        if ((initialized_const[refmap] == false) ||
+            (param.param_device.writeAngles && iConv == 0))
+        {
+          init_Constoadd<<<1, 1, 0, cudaStream[stream & 1]>>>(
+              refmap, iOrient,
+              pFFTtmp[stream & 1] +
+                  (refmap - offset) * param.param_device.NumberPixels *
+                      param.param_device.NumberPixels,
+              &pTmp_comp_params[k], pProb_device, param.param_device, *gpumap,
+              (int) initialized_const[refmap]);
+          initialized_const[refmap] = true;
+          printCudaDebug("time for init_Constoadd kernel");
+        }
+
+        doRefMap_GPU_Parallel<<<divup(maxParallelConv *
+                                          param.param_device.NtotDisp,
+                                      CudaThreadCount),
+                                CudaThreadCount, 0, cudaStream[stream & 1]>>>(
+            refmap, iOrient, iConv, maxParallelConv,
+            pFFTtmp[stream & 1] +
+                (refmap - offset) * param.param_device.NumberPixels *
+                    param.param_device.NumberPixels,
+            &pTmp_comp_params[k], &pTmp_comp_blocks[refmap * Ncomp_blocks],
+            pProb_device, param.param_device, *gpumap, nRef,
+            param.param_device.NumberPixels -
+                param.param_device.maxDisplaceCenter);
+        printCudaDebug("time for doRefMaps_GPU_Parallel kernel");
+        // One block; its thread count equals the grid size of the launch above
+        doRefMap_GPU_Reduce<<<1, divup(maxParallelConv *
+                                           param.param_device.NtotDisp,
+                                       CudaThreadCount),
+                              0, cudaStream[stream & 1]>>>(
+            refmap, iOrient, iConv, maxParallelConv,
+            pFFTtmp[stream & 1] +
+                (refmap - offset) * param.param_device.NumberPixels *
+                    param.param_device.NumberPixels,
+            &pTmp_comp_params[k], &pTmp_comp_blocks[refmap * Ncomp_blocks],
+            pProb_device, param.param_device, *gpumap, nRef,
+            param.param_device.NumberPixels -
+                param.param_device.maxDisplaceCenter);
+        printCudaDebug("time for doRefMaps_GPU_Reduce kernel");
+      }
+    }
+  }
+  checkCudaErrors(cudaPeekAtLastError());
+
+  if (GPUDualStream)
+  { // stream 0 waits for the work queued on stream 1
+    checkCudaErrors(cudaEventRecord(cudaFFTEvent[1], cudaStream[1]));
+    checkCudaErrors(cudaStreamWaitEvent(cudaStream[0], cudaFFTEvent[1], 0));
+  }
+  // maxRef is passed in the startMap position of the base-class call.
+  if ((BioEMAlgo == 1) && (GPUWorkload < 100))
+  { // NOTE(review): presumably the CPU handles maps from maxRef on - confirm
+    bioem::compareRefMaps(iPipeline, iOrient, iConv, maxParallelConv,
+                          conv_mapsFFT, comp_params, maxRef);
+    printCudaDebug("time to run OMP");
+  }
+  if (GPUAsync)
+  { // asynchronous mode: only mark completion; the next call on this slot waits
+    checkCudaErrors(cudaEventRecord(cudaEvent[iPipeline & 1], cudaStream[0]));
+  }
+  else
+  {
+    checkCudaErrors(cudaStreamSynchronize(cudaStream[0]));
+    printCudaDebug("time to synch at the end");
+  }
+  return (0);
 }
 
 int bioem_cuda::selectCudaDevice()
 {
-	int count;
-	int bestDevice = 0;
-	cudaDeviceProp deviceProp;
-
-	/* Initializing CUDA driver API */
-	cuErrorCheck(cuInit(0));
-
-	/* Get number of available CUDA devices */
-	checkCudaErrors(cudaGetDeviceCount(&count));
-	if (count == 0)
-	{
-		printf("No CUDA device detected\n");
-		return(1);
-	}
-
-	/* Find the best GPU */
-	long long int bestDeviceSpeed = -1, deviceSpeed = -1;
-	for (int i = 0; i < count; i++)
-	{
-		cudaGetDeviceProperties(&deviceProp, i);
-		deviceSpeed = (long long int) deviceProp.multiProcessorCount * (long long int) deviceProp.clockRate * (long long int) deviceProp.warpSize;
-		if (deviceSpeed > bestDeviceSpeed)
-		{
-			bestDevice = i;
-			bestDeviceSpeed = deviceSpeed;
-		}
-	}
-
-	/* Get user-specified GPU choice */
-	if (getenv("GPUDEVICE"))
-	{
-		int device = atoi(getenv("GPUDEVICE"));
-		if (device > count)
-		{
-			printf("Invalid CUDA device specified, max device number is %d\n", count);
-			exit(1);
-		}
+  int count;
+  int bestDevice = 0;
+  cudaDeviceProp deviceProp;
+  // Returns 1 when no CUDA device is found, 0 otherwise; bad GPUDEVICE exits.
+  /* Initializing CUDA driver API */
+  cuErrorCheck(cuInit(0));
+
+  /* Get number of available CUDA devices */
+  checkCudaErrors(cudaGetDeviceCount(&count));
+  if (count == 0)
+  {
+    printf("No CUDA device detected\n");
+    return (1);
+  }
+
+  /* Find the best GPU: largest multiProcessorCount * clockRate * warpSize */
+  long long int bestDeviceSpeed = -1, deviceSpeed = -1;
+  for (int i = 0; i < count; i++)
+  {
+    cudaGetDeviceProperties(&deviceProp, i);
+    deviceSpeed = (long long int) deviceProp.multiProcessorCount *
+                  (long long int) deviceProp.clockRate *
+                  (long long int) deviceProp.warpSize;
+    if (deviceSpeed > bestDeviceSpeed)
+    {
+      bestDevice = i;
+      bestDeviceSpeed = deviceSpeed;
+    }
+  }
+
+  /* Get user-specified GPU choice (GPUDEVICE overrides the pick above) */
+  if (getenv("GPUDEVICE"))
+  {
+    int device = atoi(getenv("GPUDEVICE"));
+    if (device >= count) // valid device indices are 0 .. count - 1
+    {
+      printf("Invalid CUDA device specified, max device number is %d\n", count - 1);
+      exit(1);
+    }
 #ifdef WITH_MPI
-		if (device == -1)
-		{
-			device = mpi_rank % count;
-		}
+    if (device == -1) // -1 spreads the MPI ranks over the devices round-robin
+    {
+      device = mpi_rank % count;
+    }
 #endif
-		if (device < 0)
-		{
-			printf("Negative CUDA device specified: %d, invalid!\n", device);
-			exit(1);
-		}
-		bestDevice = device;
-	}
-
-	/* Set CUDA processes to appropriate devices */
-	cudaGetDeviceProperties(&deviceProp, bestDevice);
-	if (deviceProp.computeMode == 0)
-	{
-		checkCudaErrors(cudaSetDevice(bestDevice));
-	}
-	else
-	{
-		if (DebugOutput >= 1)
-		{
-			printf("CUDA device %d is not set in DEFAULT mode, make sure that CUDA processes are pinned as planned!\n", bestDevice);
-			printf("Pinning process %d to CUDA device %d\n", mpi_rank, bestDevice);
-		}
-		checkCudaErrors(cudaSetDevice(bestDevice));
-		/* This synchronization is needed in order to detect bogus silent errors from cudaSetDevice call */
-		checkCudaErrors(cudaDeviceSynchronize());
-	}
-
-	/* Debugging information about CUDA devices used by the current process */
-	if (DebugOutput >= 3)
-	{
-		printf("Using CUDA Device %s with Properties:\n", deviceProp.name);
-		printf("totalGlobalMem = %lld\n", (unsigned long long int) deviceProp.totalGlobalMem);
-		printf("sharedMemPerBlock = %lld\n", (unsigned long long int) deviceProp.sharedMemPerBlock);
-		printf("regsPerBlock = %d\n", deviceProp.regsPerBlock);
-		printf("warpSize = %d\n", deviceProp.warpSize);
-		printf("memPitch = %lld\n", (unsigned long long int) deviceProp.memPitch);
-		printf("maxThreadsPerBlock = %d\n", deviceProp.maxThreadsPerBlock);
-		printf("maxThreadsDim = %d %d %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
-		printf("maxGridSize = %d %d %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
-		printf("totalConstMem = %lld\n", (unsigned long long int) deviceProp.totalConstMem);
-		printf("major = %d\n", deviceProp.major);
-		printf("minor = %d\n", deviceProp.minor);
-		printf("clockRate = %d\n", deviceProp.clockRate);
-		printf("memoryClockRate = %d\n", deviceProp.memoryClockRate);
-		printf("multiProcessorCount = %d\n", deviceProp.multiProcessorCount);
-		printf("textureAlignment = %lld\n", (unsigned long long int) deviceProp.textureAlignment);
-		printf("computeMode = %d\n", deviceProp.computeMode);
+    if (device < 0)
+    {
+      printf("Negative CUDA device specified: %d, invalid!\n", device);
+      exit(1);
+    }
+    bestDevice = device;
+  }
+
+  /* Set CUDA processes to appropriate devices */
+  cudaGetDeviceProperties(&deviceProp, bestDevice);
+  if (deviceProp.computeMode == 0)
+  {
+    checkCudaErrors(cudaSetDevice(bestDevice));
+  }
+  else
+  {
+    if (DebugOutput >= 1)
+    {
+      printf("CUDA device %d is not set in DEFAULT mode, make sure that CUDA "
+             "processes are pinned as planned!\n",
+             bestDevice);
+      printf("Pinning process %d to CUDA device %d\n", mpi_rank, bestDevice);
+    }
+    checkCudaErrors(cudaSetDevice(bestDevice));
+    /* This synchronization is needed in order to detect bogus silent errors
+     * from cudaSetDevice call */
+    checkCudaErrors(cudaDeviceSynchronize());
+  }
+
+  /* Debugging information about CUDA devices used by the current process */
+  if (DebugOutput >= 2)
+  {
+    printf("Using CUDA Device %s with Properties:\n", deviceProp.name);
+    printf("totalGlobalMem = %lld\n",
+           (unsigned long long int) deviceProp.totalGlobalMem);
+    printf("sharedMemPerBlock = %lld\n",
+           (unsigned long long int) deviceProp.sharedMemPerBlock);
+    printf("regsPerBlock = %d\n", deviceProp.regsPerBlock);
+    printf("warpSize = %d\n", deviceProp.warpSize);
+    printf("memPitch = %lld\n", (unsigned long long int) deviceProp.memPitch);
+    printf("maxThreadsPerBlock = %d\n", deviceProp.maxThreadsPerBlock);
+    printf("maxThreadsDim = %d %d %d\n", deviceProp.maxThreadsDim[0],
+           deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
+    printf("maxGridSize = %d %d %d\n", deviceProp.maxGridSize[0],
+           deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
+    printf("totalConstMem = %lld\n",
+           (unsigned long long int) deviceProp.totalConstMem);
+    printf("major = %d\n", deviceProp.major);
+    printf("minor = %d\n", deviceProp.minor);
+    printf("clockRate = %d\n", deviceProp.clockRate);
+    printf("memoryClockRate = %d\n", deviceProp.memoryClockRate);
+    printf("multiProcessorCount = %d\n", deviceProp.multiProcessorCount);
+    printf("textureAlignment = %lld\n",
+           (unsigned long long int) deviceProp.textureAlignment);
+    printf("computeMode = %d\n", deviceProp.computeMode);
 #if CUDA_VERSION > 3010
-		size_t free, total;
+    size_t free, total;
 #else
-		unsigned int free, total;
+    unsigned int free, total;
 #endif
-		if (deviceProp.computeMode == 0)
-		{
-			CUdevice tmpDevice;
-			cuErrorCheck(cuDeviceGet(&tmpDevice, bestDevice));
-			CUcontext tmpContext;
-			cuErrorCheck(cuCtxCreate(&tmpContext, 0, tmpDevice));
-			cuErrorCheck(cuMemGetInfo(&free, &total));
-			cuErrorCheck(cuCtxDestroy(tmpContext));
-		}
-		else
-		{
-			cuErrorCheck(cuMemGetInfo(&free, &total));
-		}
-		printf("free memory = %lld; total memory = %lld\n", free, total);
-	}
-
-	if (DebugOutput >= 1)
-	{
-		printf("BioEM for CUDA initialized (MPI Rank %d), %d GPUs found, using GPU %d\n", mpi_rank, count, bestDevice);
-	}
-
-	return(0);
+    if (deviceProp.computeMode == 0)
+    { // query memory through a temporary driver context, destroyed right after
+      CUdevice tmpDevice;
+      cuErrorCheck(cuDeviceGet(&tmpDevice, bestDevice));
+      CUcontext tmpContext;
+      cuErrorCheck(cuCtxCreate(&tmpContext, 0, tmpDevice));
+      cuErrorCheck(cuMemGetInfo(&free, &total));
+      cuErrorCheck(cuCtxDestroy(tmpContext));
+    }
+    else
+    {
+      cuErrorCheck(cuMemGetInfo(&free, &total));
+    }
+    printf("free memory = %lld; total memory = %lld\n", (long long int) free, (long long int) total); // casts match %lld for either declared type
+  }
+
+  if (DebugOutput >= 1)
+  {
+    printf("BioEM for CUDA initialized (MPI Rank %d), %d GPUs found, using GPU "
+           "%d\n",
+           mpi_rank, count, bestDevice);
+  }
+
+  return (0);
 }
 
 int bioem_cuda::deviceInit()
 {
-	deviceExit();
-	
-	selectCudaDevice();
-
-	if (FFTAlgo) GPUAlgo = 2;
-
-	gpumap = new bioem_RefMap;
-	memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
-	if (FFTAlgo == 0)
-	{
-		checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize));
-
-		if (GPUAlgo == 0 || GPUAlgo == 1)
-		{
-			pRefMap_device_Mod = (bioem_RefMap_Mod*) gpumap;
-			bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod;
-			RefMapGPU->init(RefMap);
-			checkCudaErrors(cudaMemcpy(maps, RefMapGPU->maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
-			delete RefMapGPU;
-		}
-		else
-		{
-			checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
-		}
-	}
-	checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
-	checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
-	checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
-	checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
-	gpumap->maps = maps;
-	gpumap->sum_RefMap = sum;
-	gpumap->sumsquare_RefMap = sumsquare;
-
-	checkCudaErrors(cudaMalloc(&pProb_memory, pProb_device.get_size(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, param.param_device.writeAngles, param.param_device.writeCC)));
-
-	for (int i = 0; i < 2; i++)
-	{
-		checkCudaErrors(cudaStreamCreate(&cudaStream[i]));
-		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
-		checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
-		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
-	}
-	if (GPUAsync)
-	{
-		checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
-		checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
-	}
-
-	if (FFTAlgo)
-	{
-		checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
-		checkCudaErrors(cudaMalloc(&pFFTtmp2[0], CUDA_FFTS_AT_ONCE * param.FFTMapSize * 2 * sizeof(mycomplex_t)));
-		checkCudaErrors(cudaMalloc(&pFFTtmp[0], CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * 2 * sizeof(myfloat_t)));
-		pFFTtmp2[1] = pFFTtmp2[0] + CUDA_FFTS_AT_ONCE * param.FFTMapSize;
-		pFFTtmp[1] = pFFTtmp[0] + CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels;
-		checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
-		checkCudaErrors(cudaHostAlloc(&pConvMapFFT_Host, param.FFTMapSize * sizeof(mycomplex_t) * 2, 0));
-		checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
-	}
-
-	deviceInitialized = 1;
-	return(0);
+  deviceExit();
+
+  selectCudaDevice();
+
+  gpumap = new bioem_RefMap;
+  memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
+
+  checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
+  checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap,
+                             sizeof(myfloat_t) * RefMap.ntotRefMap,
+                             cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
+  checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap,
+                             sizeof(myfloat_t) * RefMap.ntotRefMap,
+                             cudaMemcpyHostToDevice));
+  gpumap->sum_RefMap = sum;
+  gpumap->sumsquare_RefMap = sumsquare;
+
+  checkCudaErrors(
+      cudaMalloc(&pProb_memory,
+                 pProb_device.get_size(RefMap.ntotRefMap, param.nTotGridAngles,
+                                       param.param_device.writeAngles)));
+
+  for (int i = 0; i < PIPELINE_LVL; i++)
+  {
+    checkCudaErrors(cudaStreamCreate(&cudaStream[i]));
+    checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
+  }
+  for (int i = 0; i < MULTISTREAM_LVL; i++)
+  {
+    checkCudaErrors(cudaEventCreate(&cudaFFTEvent[i]));
+  }
+  if (GPUAsync)
+  {
+    checkCudaErrors(cudaStreamCreate(&cudaStream[2]));
+    checkCudaErrors(cudaEventCreate(&cudaEvent[2]));
+  }
+
+  checkCudaErrors(
+      cudaMalloc(&pRefMapsFFT,
+                 RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
+  checkCudaErrors(
+      cudaMalloc(&pFFTtmp2[0], param.nTotParallelConv * param.nTotParallelMaps *
+                                   param.FFTMapSize * MULTISTREAM_LVL *
+                                   sizeof(mycomplex_t)));
+  checkCudaErrors(
+      cudaMalloc(&pFFTtmp[0], param.nTotParallelConv * param.nTotParallelMaps *
+                                  param.param_device.NumberPixels *
+                                  param.param_device.NumberPixels *
+                                  MULTISTREAM_LVL * sizeof(myfloat_t)));
+  for (int i = 1; i < MULTISTREAM_LVL; i++)
+  {
+    pFFTtmp2[i] =
+        pFFTtmp2[0] +
+        i * param.nTotParallelConv * param.nTotParallelMaps * param.FFTMapSize;
+    pFFTtmp[i] = pFFTtmp[0] +
+                 i * param.nTotParallelConv * param.nTotParallelMaps *
+                     param.param_device.NumberPixels *
+                     param.param_device.NumberPixels;
+  }
+  checkCudaErrors(cudaMalloc(&pConvMapFFT, param.nTotParallelConv *
+                                               param.FFTMapSize * PIPELINE_LVL *
+                                               sizeof(mycomplex_t)));
+  checkCudaErrors(cudaHostAlloc(&pConvMapFFT_Host,
+                                param.nTotParallelConv * param.FFTMapSize *
+                                    PIPELINE_LVL * sizeof(mycomplex_t),
+                                0));
+  checkCudaErrors(
+      cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT,
+                 RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t),
+                 cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMalloc(&pTmp_comp_params,
+                 param.nTotParallelConv * PIPELINE_LVL * sizeof(myparam5_t)));
+  Ncomp_blocks = divup(param.nTotParallelConv * param.param_device.NtotDisp,
+                       CudaThreadCount);
+  if (Ncomp_blocks > CudaThreadCount)
+  {
+    cout << "Error with input parameters. Check CudaThreadCount, "
+            "displacements and max number of parallel comparisons\n";
+    exit(1);
+  }
+  checkCudaErrors(
+      cudaMalloc(&pTmp_comp_blocks,
+                 Ncomp_blocks * RefMap.ntotRefMap * sizeof(myblockGPU_t)));
+
+  initialized_const = new bool[RefMap.ntotRefMap];
+  for (int i = 0; i < RefMap.ntotRefMap; i++)
+    initialized_const[i] = false;
+
+  deviceInitialized = 1;
+  return (0);
 }
 
 int bioem_cuda::deviceExit()
 {
-	if (deviceInitialized == 0) return(0);
-
-
-	cudaFree(pProb_memory);
-	cudaFree(sum);
-	cudaFree(sumsquare);
-	for (int i = 0; i < 2; i++)
-	{
-		cudaStreamDestroy(cudaStream[i]);
-		cudaEventDestroy(cudaEvent[i]);
-		cudaEventDestroy(cudaFFTEvent[i]);
-		cudaFree(pConvMap_device[i]);
-	}
-	if (FFTAlgo)
-	{
-		cudaFree(pRefMapsFFT);
-		cudaFree(pConvMapFFT);
-		cudaFreeHost(pConvMapFFT_Host);
-		cudaFree(pFFTtmp[0]);
-		cudaFree(pFFTtmp2[0]);
-	}
-	else
-	{
-		cudaFree(maps);
-	}
-	if (GPUAlgo == 0 || GPUAlgo == 1)
-	{
-		cudaFree(pRefMap_device_Mod);
-	}
-	if (GPUAsync)
-	{
-		cudaStreamDestroy(cudaStream[2]);
-		cudaEventDestroy(cudaEvent[2]);
-	}
-
-	delete gpumap;
-	cudaThreadExit();
-
-	deviceInitialized = 0;
-	return(0);
+  if (deviceInitialized == 0) // deviceInit() has not completed: nothing to release
+    return (0);
+
+  cudaFree(pProb_memory);
+  cudaFree(sum);
+  cudaFree(sumsquare);
+  for (int i = 0; i < PIPELINE_LVL; i++)
+  {
+    cudaStreamDestroy(cudaStream[i]);
+    cudaEventDestroy(cudaEvent[i]);
+  }
+  for (int i = 0; i < MULTISTREAM_LVL; i++)
+  {
+    cudaEventDestroy(cudaFFTEvent[i]);
+  }
+
+  cudaFree(pRefMapsFFT);
+  cudaFree(pConvMapFFT);
+  cudaFreeHost(pConvMapFFT_Host); // host buffer obtained with cudaHostAlloc()
+  cudaFree(pFFTtmp[0]);  // pFFTtmp[1..] point into this single allocation
+  cudaFree(pFFTtmp2[0]); // pFFTtmp2[1..] point into this single allocation
+  cudaFree(pTmp_comp_params);
+  cudaFree(pTmp_comp_blocks);
+
+  if (GPUAsync) // extra stream/event created by deviceInit() only in async mode
+  {
+    cudaStreamDestroy(cudaStream[2]);
+    cudaEventDestroy(cudaEvent[2]);
+  }
+
+  delete gpumap;
+  delete[] initialized_const; // allocated with new bool[] in deviceInit()
+  cudaThreadExit();
+
+  deviceInitialized = 0;
+  return (0);
 }
 
 int bioem_cuda::deviceStartRun()
 {
-	if (GPUWorkload >= 100)
-	{
-		maxRef = RefMap.ntotRefMap;
-		pProb_host = &pProb;
-	}
-	else
-	{
-		maxRef = RefMap.ntotRefMap == 1 ? (size_t) RefMap.ntotRefMap : (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
-		pProb_host = new bioem_Probability;
-		pProb_host->init(maxRef, param.nTotGridAngles, param.nTotCC, *this);
-		pProb_host->copyFrom(&pProb, *this);
-	}
-
-	pProb_device = *pProb_host;
-	pProb_device.ptr = pProb_memory;
-	pProb_device.set_pointers();
-	checkCudaErrors(cudaMemcpyAsync(pProb_device.ptr, pProb_host->ptr, pProb_host->get_size(maxRef, param.nTotGridAngles, param.nTotCC, param.param_device.writeAngles, param.param_device.writeCC), cudaMemcpyHostToDevice, cudaStream[0]));
-
-	if (FFTAlgo)
-	{
-		for (int j = 0;j < 2;j++)
-		{
-			for (int i = 0; i < 2; i++)
-			{
-				if (i && maxRef % CUDA_FFTS_AT_ONCE == 0) continue;
-				int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
-				if (cufftPlanMany(&plan[i][j], 2, n, NULL, 1, param.FFTMapSize, NULL, 1, 0, MY_CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
-				{
-					cout << "Error planning CUFFT\n";
-					exit(1);
-				}
-			        if (cufftSetCompatibilityMode(plan[i][j], CUFFT_COMPATIBILITY_FFTW_PADDING) != CUFFT_SUCCESS)
-				{
-					cout << "Error planning CUFFT compatibility\n";
-					exit(1);
-				}
-				if (cufftSetStream(plan[i][j], cudaStream[j]) != CUFFT_SUCCESS)
-				{
-					cout << "Error setting CUFFT stream\n";
-					exit(1);
-				}
-			}
-			if (!GPUDualStream) break;
-		}
-	}
-	return(0);
+  if (GPUWorkload >= 100) // the GPU handles every reference map
+  {
+    maxRef = RefMap.ntotRefMap;
+    pProb_host = &pProb; // work directly on the main probability object
+  }
+  else // the GPU handles only a share: stage it in a separately allocated object
+  {
+    maxRef = ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100) < 1 ?
+                 (size_t) RefMap.ntotRefMap : // share rounds down to 0: take all maps
+                 (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
+    pProb_host = new bioem_Probability; // released again in deviceFinishRun()
+    pProb_host->init(maxRef, param.nTotGridAngles, *this);
+    pProb_host->copyFrom(&pProb, *this);
+  }
+
+  pProb_device = *pProb_host;
+  pProb_device.ptr = pProb_memory; // device buffer allocated in deviceInit()
+  pProb_device.set_pointers();
+  checkCudaErrors(
+      cudaMemcpyAsync(pProb_device.ptr, pProb_host->ptr,
+                      pProb_host->get_size(maxRef, param.nTotGridAngles,
+                                           param.param_device.writeAngles),
+                      cudaMemcpyHostToDevice, cudaStream[0]));
+
+  if (maxRef / (param.nTotParallelMaps * param.nTotParallelConv) >
+      (double) SPLIT_MAPS_LVL) // NOTE(review): quotient looks like integer division (maxRef is int) -- confirm intended
+  {
+    cout << "Error planning CUFFT dimensions\n";
+    exit(1);
+  }
+  for (int j = 0; j < MULTISTREAM_LVL; j++) // plan[i][j] is bound to cudaStream[j] below
+  {
+    for (int i = 0; i < SPLIT_MAPS_LVL; i++)
+    {
+      if (i && maxRef % param.nTotParallelMaps == 0) // no remainder batch: plan[1][j] is not created
+        continue;
+      int n[2] = {param.param_device.NumberPixels,
+                  param.param_device.NumberPixels};
+      if (cufftPlanMany(
+              &plan[i][j], 2, n, NULL, 1, param.FFTMapSize, NULL, 1, 0,
+              MY_CUFFT_C2R, // batch below: full size for i == 0, remainder maps otherwise
+              i ? ((maxRef % param.nTotParallelMaps) * param.nTotParallelConv) :
+                  (param.nTotParallelMaps * param.nTotParallelConv)) !=
+          CUFFT_SUCCESS)
+      {
+        cout << "Error planning CUFFT\n";
+        exit(1);
+      }
+      if (cufftSetCompatibilityMode(
+              plan[i][j], CUFFT_COMPATIBILITY_FFTW_PADDING) != CUFFT_SUCCESS)
+      {
+        cout << "Error planning CUFFT compatibility\n";
+        exit(1);
+      }
+      if (cufftSetStream(plan[i][j], cudaStream[j]) != CUFFT_SUCCESS)
+      {
+        cout << "Error setting CUFFT stream\n";
+        exit(1);
+      }
+    }
+    if (!GPUDualStream) // without dual-stream mode only the j == 0 plans are created
+      break;
+  }
+
+  return (0);
 }
 
 int bioem_cuda::deviceFinishRun()
 {
-	if (GPUAsync) cudaStreamSynchronize(cudaStream[0]);
-	checkCudaErrors(cudaMemcpyAsync(pProb_host->ptr, pProb_device.ptr, pProb_host->get_size(maxRef, param.nTotGridAngles, param.nTotCC, param.param_device.writeAngles, param.param_device.writeCC), cudaMemcpyDeviceToHost, cudaStream[0]));
-
-	if (FFTAlgo)
-	{
-		for (int j = 0;j < 2;j++)
-		{
-			for (int i = 0; i < 2; i++)
-			{
-				if (i && maxRef % CUDA_FFTS_AT_ONCE == 0) continue;
-				cufftDestroy(plan[i][j]);
-			}
-			if (!GPUDualStream) break;
-		}
-	}
-	cudaThreadSynchronize();
-	if (GPUWorkload < 100)
-	{
-		pProb.copyFrom(pProb_host, *this);
-		free_device_host(pProb_host->ptr);
-		delete[] pProb_host;
-	}
-
-	return(0);
-}
+  if (GPUAsync)
+    cudaStreamSynchronize(cudaStream[0]);
+  checkCudaErrors( // copy the probabilities back from the device buffer
+      cudaMemcpyAsync(pProb_host->ptr, pProb_device.ptr,
+                      pProb_host->get_size(maxRef, param.nTotGridAngles,
+                                           param.param_device.writeAngles),
+                      cudaMemcpyDeviceToHost, cudaStream[0]));
+
+  for (int j = 0; j < MULTISTREAM_LVL; j++) // destroy exactly the plans deviceStartRun() created
+  {
+    for (int i = 0; i < SPLIT_MAPS_LVL; i++)
+    {
+      if (i && maxRef % param.nTotParallelMaps == 0) // same skip rule as in deviceStartRun()
+        continue;
+      cufftDestroy(plan[i][j]);
+    }
+    if (!GPUDualStream)
+      break;
+  }
 
-void* bioem_cuda::malloc_device_host(size_t size)
-{
-	void* ptr;
-	checkCudaErrors(cudaHostAlloc(&ptr, size, 0));
-	return(ptr);
+  cudaThreadSynchronize(); // the copy above is asynchronous: wait before using the results
+  if (GPUWorkload < 100) // only then is pProb_host a separate object (see deviceStartRun())
+  {
+    pProb.copyFrom(pProb_host, *this);
+    free_device_host(pProb_host->ptr);
+    delete pProb_host; // single object from `new bioem_Probability`, so scalar delete
+  }
+
+  return (0);
 }
 
-void bioem_cuda::free_device_host(void* ptr)
+void *bioem_cuda::malloc_device_host(size_t size)
 {
-	cudaFreeHost(ptr);
+  void *ptr;
+  checkCudaErrors(cudaHostAlloc(&ptr, size, 0)); // host memory via the CUDA runtime, flags = 0
+  return (ptr); // release with free_device_host()
 }
 
+void bioem_cuda::free_device_host(void *ptr) { cudaFreeHost(ptr); } // counterpart of malloc_device_host()
+
 void bioem_cuda::rebalance(int workload)
 {
-	if ((workload < 0) || (workload > 100) || (workload == GPUWorkload)) return;
+  if ((workload < 0) || (workload > 100) || (workload == GPUWorkload))
+    return; // out-of-range or unchanged workload: nothing to do
 
-	deviceFinishRun();
+  deviceFinishRun(); // close the current run: copies results back and destroys the FFT plans
 
-	if (DebugOutput >= 2)
-	{
-	  printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank);
-	}
+  if (DebugOutput >= 2)
+  {
+    printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank);
+  }
 
-	GPUWorkload = workload;
-	maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
+  GPUWorkload = workload;
+  maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100; // deviceStartRun() below assigns maxRef again
 
-	deviceStartRun();
+  deviceStartRun(); // start a new run with the new GPU share
 }
 
-bioem* bioem_cuda_create()
+bioem *bioem_cuda_create()
 {
-	int count;
-	
-	if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
-	if (count == 0)
-	{
-		printf("No CUDA device available, using fallback to CPU version\n");
-		return new bioem;
-	}
-
-	return new bioem_cuda;
+  int count;
+
+  if (cudaGetDeviceCount(&count) != cudaSuccess) // a failed query is treated as "no device"
+    count = 0;
+  if (count == 0)
+  {
+    printf("No CUDA device available, using fallback to CPU version\n");
+    return new bioem; // base-class (non-CUDA) implementation
+  }
+
+  return new bioem_cuda; // caller receives ownership through the bioem base pointer
 }
diff --git a/include/autotuner.h b/include/autotuner.h
index 10db9ca8d21810f883d4d3bbf74dd9895a9e1498..64ad3ae071a6133da6153233f38e0a0a8166d767 100644
--- a/include/autotuner.h
+++ b/include/autotuner.h
@@ -1,9 +1,11 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, Markus Rampp, Luka Stanisic and Gerhard
+   Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -11,52 +13,58 @@
 #ifndef AUTOTUNER_H
 #define AUTOTUNER_H
 
-class Autotuner {
+class Autotuner
+{
 
 public:
-	Autotuner() {stopTuning = true;}
+  Autotuner() { stopTuning = true; }
 
-	/* Setting variables to initial values */
-	inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }
+  /* Setting variables to initial values */
+  inline void Initialize(int alg = 3, int st = 7)
+  {
+    algo = alg;
+    stable = st;
+    Reset();
+  }
 
-	/* Resetting variables to initial values */
-	void Reset();
+  /* Resetting variables to initial values */
+  void Reset();
 
-	/* Check if autotuning is needed, depending on which comparison is finished */
-	bool Needed(int iteration);
+  /* Check if autotuning is needed, depending on which comparison is finished */
+  bool Needed(int iteration);
 
-	/* Check if optimal workload value has been computed */
-	bool Finished();
+  /* Check if optimal workload value has been computed */
+  bool Finished();
 
-	/* Set a new workload value to test, depending on the algorithm */
-	void Tune(double compTime);
+  /* Set a new workload value to test, depending on the algorithm */
+  void Tune(double compTime);
 
-	/* Return workload value */
-	inline int Workload() {return workload;}
+  /* Return workload value */
+  inline int Workload() { return workload; }
 
 private:
-	int algo;
-	int stable;
-
-	bool stopTuning;
-	int workload;
-
-	/* Variables needed for AlgoSimple and AlgoRatio */
-	double best_time;
-	int best_workload;
-
-	/* Variables needed for AlgoBisection */
-	int a;
-	int b;
-	int c;
-	int x;
-	int limit;
-	double fb, fx;
-
-	/* Autotuning algorithms */
-	void AlgoSimple(double compTime);
-	void AlgoRatio(double compTime);
-	void AlgoBisection(double compTime);
+  int algo;
+  int stable;
+
+  bool stopTuning;
+  int workload;
+
+  /* Variables needed for AlgoSimple and AlgoRatio */
+  double best_time;
+  int best_workload;
+
+  /* Variables needed for AlgoBisection */
+  int a;
+  int b;
+  int c;
+  int x;
+  int limit;
+  double fb, fx;
+
+  /* Autotuning algorithms */
+  void AlgoSimple(double compTime);
+  void AlgoRatio(double compTime);
+  void AlgoBisection(double compTime);
 };
 
 #endif
diff --git a/include/bioem.h b/include/bioem.h
index 813acec26c2bcaeaa78459402d1f8eec0d8997c1..8aa3f3faa1201923fc640483a30304dc2d398d0a 100644
--- a/include/bioem.h
+++ b/include/bioem.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -14,66 +15,87 @@
 #ifndef BIOEM_H
 #define BIOEM_H
 
-#include "defs.h"
 #include "bioem.h"
-#include "model.h"
+#include "defs.h"
 #include "map.h"
+#include "model.h"
 #include "param.h"
 
 class bioem
 {
-	friend class bioem_RefMap;
-	friend class bioem_Probability;
+  friend class bioem_RefMap;
+  friend class bioem_Probability;
 
 public:
-	bioem();
-	virtual ~bioem();
-
-	int configure(int ac, char* av[]);
-	void cleanup(); //Cleanup everything happening during configure
-
-	int precalculate(); // Is it better to pass directly the input File names?
-	int dopreCalCrossCorrelation(int iRefMap, int iRefMapLocal);
-	int run();
-	int doProjections(int iMap);
-	int createConvolutedProjectionMap(int iOreint, int iMap, mycomplex_t* lproj, myfloat_t* Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC);
-
-	virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
-
-	virtual void* malloc_device_host(size_t size);
-	virtual void free_device_host(void* ptr);
-	virtual void rebalance(int workload); //Rebalance GPUWorkload
-	void rebalanceWrapper(int workload); //Rebalance wrapper
-
-	int createProjection(int iMap, mycomplex_t* map);
-	int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
-	void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT, mycomplex_t* localCCT, myfloat_t* lCC);
-
-	bioem_Probability pProb;
-
-        string OutfileName;
-	bool yesoutfilename;
-
+  bioem();
+  virtual ~bioem();
+
+  void printOptions(myoption_t *myoptions, int myoptions_length);
+  int readOptions(int ac, char *av[]);
+  int configure(int ac, char *av[]);
+  void cleanup(); // Cleanup everything happening during configure
+
+  int precalculate(); // Is it better to pass directly the input File names?
+  inline int needToPrintModel() { return param.printModel; }
+  int printModel();
+  int run();
+  int doProjections(int iMap);
+  int createConvolutedProjectionMap(int iOreint, int iMap, mycomplex_t *lproj,
+                                    mycomplex_t *localmultFFT, myfloat_t &sumC,
+                                    myfloat_t &sumsquareC);
+  int createConvolutedProjectionMap_noFFT(mycomplex_t *lproj,
+                                          myfloat_t *Mapconv,
+                                          mycomplex_t *localmultFFT,
+                                          myfloat_t &sumC,
+                                          myfloat_t &sumsquareC);
+
+  virtual int compareRefMaps(int iPipeline, int iOrient, int iConv,
+                             int maxParallelConv, mycomplex_t *localmultFFT,
+                             myparam5_t *comp_params, const int startMap = 0);
+
+  virtual void *malloc_device_host(size_t size);
+  virtual void free_device_host(void *ptr);
+  virtual void rebalance(int workload); // Rebalance GPUWorkload
+  void rebalanceWrapper(int workload);  // Rebalance wrapper
+
+  int createProjection(int iMap, mycomplex_t *map);
+  int calcross_cor(myfloat_t *localmap, myfloat_t &sum, myfloat_t &sumsquare);
+  void calculateCCFFT(int iMap, mycomplex_t *localConvFFT,
+                      mycomplex_t *localCCT, myfloat_t *lCC);
+  void doRefMap_CPU_Parallel(int iRefMap, int iOrient, int iConv,
+                             myfloat_t *lCC, myparam5_t *comp_params,
+                             myblockCPU_t *comp_block);
+  void doRefMap_CPU_Reduce(int iRefMap, int iOrient, int iConvStart,
+                           int maxParallelConv, myparam5_t *comp_params,
+                           myblockCPU_t *comp_block);
+
+  bioem_Probability pProb;
+
+  string OutfileName;
 
 protected:
-	virtual int deviceInit();
-	virtual int deviceStartRun();
-	virtual int deviceFinishRun();
-
-	bioem_param param;
-	bioem_model Model;
-	bioem_RefMap RefMap;
-
-	int nReferenceMaps;			//Maps in memory at a time
-	int nReferenceMapsTotal;	//Maps in total
-
-	int nProjectionMaps;		//Maps in memory at a time
-	int nProjectionMapsTotal;	//Maps in total
-
-	int FFTAlgo;				//Use the FFT Algorithm (Default 1)
-	int DebugOutput;			//Debug Output Level (Default 2)
-	int nProjectionsAtOnce;		//Number of projections to do at once via OpenMP (Default 1)
-	bool Autotuning;				//Do the autotuning of the load-balancing between CPUs and GPUs
+  virtual int deviceInit();
+  virtual int deviceStartRun();
+  virtual int deviceFinishRun();
+
+  bioem_param param;
+  bioem_model Model;
+  bioem_RefMap RefMap;
+
+  int nReferenceMaps;      // Maps in memory at a time
+  int nReferenceMapsTotal; // Maps in total
+
+  int nProjectionMaps;      // Maps in memory at a time
+  int nProjectionMapsTotal; // Maps in total
+
+  int BioEMAlgo;          // BioEM algorithm used to do comparison (Default 1)
+  int CudaThreadCount;    // Number of CUDA threads used in each block (Default
+                          // depends on the BioEM algorithm)
+  int DebugOutput;        // Debug Output Level (Default 0)
+  int nProjectionsAtOnce; // Number of projections to do at once via OpenMP
+                          // (Default number of OMP threads)
+  bool Autotuning; // Do the autotuning of the load-balancing between CPUs and
+  // GPUs (Default 1, if GPUs are used and GPUWORKLOAD is not specified)
 };
 
 #endif
diff --git a/include/bioem_cuda.h b/include/bioem_cuda.h
index 99add42be1576028c88d7f66b209656de7797041..6a4552a11c6beeab13e4b1e53f10aae5678b144f 100644
--- a/include/bioem_cuda.h
+++ b/include/bioem_cuda.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
+   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
         Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -16,6 +17,6 @@
 
 #include "bioem.h"
 
-extern bioem* bioem_cuda_create();
+extern bioem *bioem_cuda_create();
 
 #endif
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 708b40fb3e9a2a10b265965baaa224244159b975..1b269a393bc28b5e79f69fa6211f770c62f02e1e 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -17,7 +18,7 @@
 #include <cuda.h>
 #include <cufft.h>
 
-//Hack to make nvcc compiler accept fftw.h, float128 is not used anyway
+// Hack to make nvcc compiler accept fftw.h, float128 is not used anyway
 #define __float128 double
 #include <fftw3.h>
 #undef __float128
@@ -27,51 +28,60 @@
 class bioem_cuda : public bioem
 {
 public:
-	bioem_cuda();
-	virtual ~bioem_cuda();
+  bioem_cuda();
+  virtual ~bioem_cuda();
 
-	virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
-	virtual void* malloc_device_host(size_t size);
-	virtual void free_device_host(void* ptr);
-	virtual void rebalance(int workload); //Rebalance GPUWorkload
+  virtual int compareRefMaps(int iPipeline, int iOrient, int iConv,
+                             int maxParallelConv, mycomplex_t *localmultFFT,
+                             myparam5_t *comp_params, const int startMap = 0);
+  virtual void *malloc_device_host(size_t size);
+  virtual void free_device_host(void *ptr);
+  virtual void rebalance(int workload); // Rebalance GPUWorkload
 
 protected:
-	virtual int deviceInit();
-	virtual int deviceStartRun();
-	virtual int deviceFinishRun();
-	int deviceExit();
-	
+  virtual int deviceInit();
+  virtual int deviceStartRun();
+  virtual int deviceFinishRun();
+  int deviceExit();
+
 private:
-	int selectCudaDevice();
-
-	int deviceInitialized;
-
-	cudaStream_t cudaStream[3];
-	cudaEvent_t cudaEvent[3];
-	cudaEvent_t cudaFFTEvent[2];
-	bioem_RefMap_Mod* pRefMap_device_Mod;
-	bioem_RefMap* gpumap;
-	bioem_Probability* pProb_host;
-	bioem_Probability pProb_device;
-	void* pProb_memory;
-	myfloat_t* pConvMap_device[2];
-
-	mycomplex_t* pRefMapsFFT;
-	mycomplex_t* pConvMapFFT;
-	mycomplex_t* pConvMapFFT_Host;
-	mycuComplex_t* pFFTtmp2[2];
-	myfloat_t* pFFTtmp[2];
-	cufftHandle plan[2][2];
-
-	myfloat_t *maps, *sum, *sumsquare;
-
-	int GPUAlgo;		//GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
-	int GPUAsync;		//Run GPU Asynchronously, do the convolutions on the host in parallel.
-	int GPUDualStream;	//Use two streams to improve paralelism
-	int GPUWorkload;	//Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel.
-
-	int maxRef;
+  int selectCudaDevice();
+
+  int deviceInitialized;
+
+  cudaStream_t cudaStream[PIPELINE_LVL + 1]; // Streams are used for both
+                                             // PIPELINE and MULTISTREAM control
+  cudaEvent_t cudaEvent[PIPELINE_LVL + 1];
+  cudaEvent_t cudaFFTEvent[MULTISTREAM_LVL];
+  bioem_RefMap *gpumap;
+  bioem_Probability *pProb_host;
+  bioem_Probability pProb_device;
+  void *pProb_memory;
+
+  mycomplex_t *pRefMapsFFT;
+  mycomplex_t *pConvMapFFT;
+  mycomplex_t *pConvMapFFT_Host;
+  mycuComplex_t *pFFTtmp2[MULTISTREAM_LVL];
+  myfloat_t *pFFTtmp[MULTISTREAM_LVL];
+  cufftHandle plan[SPLIT_MAPS_LVL][MULTISTREAM_LVL];
+
+  myparam5_t *pTmp_comp_params;
+
+  myblockGPU_t *pTmp_comp_blocks;
+  int Ncomp_blocks;
+
+  bool *initialized_const; // In order to make sure Constoadd is initialized to
+                           // the first value
+
+  myfloat_t *sum, *sumsquare;
+
+  int GPUAsync; // Run GPU Asynchronously, do the convolutions on the host in
+                // parallel.
+  int GPUDualStream; // Use two streams to improve parallelism
+  int GPUWorkload;   // Percentage of workload to perform on GPU. Default 100.
+                     // Rest is done on processor in parallel.
+
+  int maxRef;
 };
 
 #endif
-
diff --git a/include/defs.h b/include/defs.h
index b7338ca8246126becf8cebce6191b0fa7ca86ec6..70438dd27ad804c763a36e255beb9439bad8aa09 100644
--- a/include/defs.h
+++ b/include/defs.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -14,11 +15,22 @@
 #ifndef BIOEM_DEFS_H
 #define BIOEM_DEFS_H
 
+#define BIOEM_PROB_DOUBLE
 //#define BIOEM_USE_DOUBLE
+//#define DEBUG
+//#define DEBUG_GPU
+//#define DEBUG_PROB
 
-//#define PILAR_DEBUG
+#ifndef BIOEM_PROB_DOUBLE
+typedef float myprob_t;
+#define MY_MPI_FLOAT MPI_FLOAT
+#else
+typedef double myprob_t;
+#define MY_MPI_FLOAT MPI_DOUBLE
+#endif
 
 #ifndef BIOEM_USE_DOUBLE
+#define MIN_PROB -999999.
 typedef float myfloat_t;
 #define myfftw_malloc fftwf_malloc
 #define myfftw_free fftwf_free
@@ -35,9 +47,9 @@ typedef float myfloat_t;
 #define MY_CUFFT_C2R CUFFT_C2R
 #define mycufftExecC2R cufftExecC2R
 #define mycuComplex_t cuComplex
-#define MY_MPI_FLOAT MPI_FLOAT
 #else
 typedef double myfloat_t;
+#define MIN_PROB -999999.
 #define myfftw_malloc fftw_malloc
 #define myfftw_free fftw_free
 #define myfftw_destroy_plan fftw_destroy_plan
@@ -53,17 +65,62 @@ typedef double myfloat_t;
 #define mycufftExecC2R cufftExecZ2D
 #define mycuComplex_t cuDoubleComplex
 #define MY_CUFFT_C2R CUFFT_Z2D
-#define MY_MPI_FLOAT MPI_DOUBLE
 #endif
 typedef myfloat_t mycomplex_t[2];
 
-#define BIOEM_FLOAT_3_PHYSICAL_SIZE 3	//Possible set to 4 for GPU
+#define BIOEM_FLOAT_3_PHYSICAL_SIZE 3 // Possibly set to 4 for GPU
 
 struct myfloat3_t
 {
-	myfloat_t pos[BIOEM_FLOAT_3_PHYSICAL_SIZE];
-        myfloat_t quat4;
-   //     myfloat_t prior;
+  myfloat_t pos[BIOEM_FLOAT_3_PHYSICAL_SIZE];
+  myfloat_t quat4;
+  //     myfloat_t prior;
+};
+
+/* myoptions
+Structure for saving options, in order to mimic old Boost program_options
+behaviour
+*/
+struct myoption_t
+{
+  const char *name;
+  int arg;
+  const char *desc;
+  bool hidden;
+};
+
+/* comp_params
+Put all parameters needed for each comparison in a single structure
+This makes code cleaner and requires less GPU transfers
+*/
+struct myparam5_t
+{
+  myfloat_t amp;
+  myfloat_t pha;
+  myfloat_t env;
+  myfloat_t sumC;
+  myfloat_t sumsquareC;
+};
+
+/* comp_block
+Put all parameters created by each inside-block comparison
+This makes code cleaner
+*/
+// For GPUs
+struct myblockGPU_t
+{
+  myprob_t logpro;
+  int id;
+  myprob_t sumExp;
+  myprob_t sumAngles;
+};
+// For CPUs (easier to save value as well)
+struct myblockCPU_t
+{
+  myprob_t logpro;
+  int id;
+  myprob_t sumExp;
+  myfloat_t value;
 };
 
 #ifdef BIOEM_GPUCODE
@@ -85,44 +142,53 @@ struct myfloat3_t
 #define myBlockIdxY 0
 #endif
 
-#define CUDA_THREAD_COUNT 256
-#define CUDA_BLOCK_COUNT 1024 * 16
-#define CUDA_MAX_SHIFT_REDUCE 1024
+#define OUTPUT_PRECISION 4
+
+#define CUDA_THREAD_COUNT_ALGO1 256
+#define CUDA_THREAD_COUNT_ALGO2 512
+#define CUDA_THREAD_MAX 1024
 #define CUDA_FFTS_AT_ONCE 1024
-//#define BIOEM_USE_NVTX
+
+#define PIPELINE_LVL 2
+#define MULTISTREAM_LVL 2
+#define SPLIT_MAPS_LVL 2
 
 /* Autotuning
    Autotuning algorithms:
-    1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing.
-    2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed.
-    3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found.
+    1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples
+   of 5. Taking the value with the best timing.
+    2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the
+   workload are timed, and then the optimal workload balance is computed.
+    3. AlgoBisection = 3; Based on bisection, multiple workload values are
+   tested until the optimal one is found.
  */
 #define AUTOTUNING_ALGORITHM 3
-/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */
+/* Recalibrate every X projections. Set to a very high value, e.g., 99999, to
+ * de facto disable recalibration */
 #define RECALIB_FACTOR 200
 /* After how many comparison iterations, comparison duration becomes stable */
 #define FIRST_STABLE 7
 
-static inline void* mallocchk(size_t size)
+static inline void *mallocchk(size_t size)
 {
-	void* ptr = malloc(size);
-	if (ptr == 0)
-	{
-		std::cout << "Memory allocation error\n";
-		exit(1);
-	}
-	return(ptr);
+  void *ptr = malloc(size);
+  if (ptr == 0 && size != 0) // malloc(0) may return NULL without any error
+  {
+    std::cout << "Memory allocation error\n";
+    exit(1);
+  }
+  return (ptr);
 }
 
-static inline void* reallocchk(void* oldptr, size_t size)
+static inline void *reallocchk(void *oldptr, size_t size)
 {
-	void* ptr = realloc(oldptr, size);
-	if (ptr == 0)
-	{
-		std::cout << "Memory allocation error\n";
-		exit(1);
-	}
-	return(ptr);
+  void *ptr = realloc(oldptr, size);
+  if (ptr == 0 && size != 0) // realloc(p, 0) may return NULL without error
+  {
+    std::cout << "Memory allocation error\n";
+    exit(1);
+  }
+  return (ptr);
 }
 
 #ifndef WITH_OPENMP
diff --git a/include/map.h b/include/map.h
index 88ad682c709abfeaeec11159da73b96f98a7b1ea..9081a332495dd1026634a9fb5a72ae018398d43b 100644
--- a/include/map.h
+++ b/include/map.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -18,7 +19,6 @@
 #include "param.h"
 #include <complex>
 #include <math.h>
-#include <boost/concept_check.hpp>
 
 class bioem_param;
 class bioem;
@@ -26,137 +26,147 @@ class bioem;
 class bioem_RefMap
 {
 public:
-	bioem_RefMap()
-	{
-		maps = NULL;
-		RefMapsFFT = NULL;
-		sum_RefMap = NULL;
-		sumsquare_RefMap = NULL;
-	}
-
-	void freePointers()
-	{
-		if (maps) free(maps);
-		if (sum_RefMap) free(sum_RefMap);
-		if (sumsquare_RefMap) free(sumsquare_RefMap);
-		if (RefMapsFFT) delete[] RefMapsFFT;
-		maps = NULL;
-		sum_RefMap = NULL;
-		sumsquare_RefMap = NULL;
-		RefMapsFFT = NULL;
-	}
-	int readRefMaps(bioem_param& param, const char* filemap);
-	int precalculate(bioem_param& param, bioem& bio);
-	int PreCalculateMapsFFT(bioem_param& param);
-
-	int  read_int(int *currlong, FILE *fin, int swap);
-	int  read_float(float *currfloat, FILE *fin, int swap);
-	int  read_float_empty (FILE *fin);
-	int  read_char_float (float *currfloat, FILE *fin) ;
-	int  test_mrc (const char *vol_file, int swap);
-	int  read_MRC(const char* filename,bioem_param& param);
-
-	mycomplex_t* RefMapsFFT;
-
-	bool readMRC,readMultMRC;
-
-	int ntotRefMap;
-	int numPixels;
-	int refMapSize;
-	myfloat_t* maps;
-	myfloat_t* sum_RefMap;
-	myfloat_t* sumsquare_RefMap;
-
-	__host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(maps[map * refMapSize + x * numPixels + y]);}
-	__host__ __device__ inline const myfloat_t* getp(int map, int x, int y) const {return(&maps[map * refMapSize + x * numPixels]);}
-	__host__ __device__ inline myfloat_t* getmap(int map) {return(&maps[map * refMapSize]);}
+  bioem_RefMap()
+  {
+    maps = NULL;
+    RefMapsFFT = NULL;
+    sum_RefMap = NULL;
+    sumsquare_RefMap = NULL;
+  }
+
+  void freePointers()
+  {
+    if (maps)
+      free(maps);
+    if (sum_RefMap)
+      free(sum_RefMap);
+    if (sumsquare_RefMap)
+      free(sumsquare_RefMap);
+    if (RefMapsFFT)
+      delete[] RefMapsFFT;
+    maps = NULL;
+    sum_RefMap = NULL;
+    sumsquare_RefMap = NULL;
+    RefMapsFFT = NULL;
+  }
+  int readRefMaps(bioem_param &param, const char *filemap);
+  int precalculate(bioem_param &param, bioem &bio);
+  int PreCalculateMapsFFT(bioem_param &param);
+
+  int read_int(int *currlong, FILE *fin, int swap);
+  int read_float(float *currfloat, FILE *fin, int swap);
+  int read_float_empty(FILE *fin);
+  int read_char_float(float *currfloat, FILE *fin);
+  int test_mrc(const char *vol_file, int swap);
+  int read_MRC(const char *filename, bioem_param &param);
+
+  mycomplex_t *RefMapsFFT;
+
+  bool readMRC, readMultMRC;
+
+  int ntotRefMap;
+  int numPixels;
+  int refMapSize;
+  myfloat_t *maps;
+  myfloat_t *sum_RefMap;
+  myfloat_t *sumsquare_RefMap;
+
+  __host__ __device__ inline myfloat_t get(int map, int x, int y) const
+  { // offset computed in size_t: map * refMapSize may exceed INT_MAX
+    return (maps[(size_t) map * refMapSize + x * numPixels + y]);
+  }
+  __host__ __device__ inline const myfloat_t *getp(int map, int x, int y) const
+  { // start of the x-th run of numPixels values of the map; y is unused
+    return (&maps[(size_t) map * refMapSize + x * numPixels]);
+  }
+  __host__ __device__ inline myfloat_t *getmap(int map)
+  { // first value of the given map (maps are refMapSize values apart)
+    return (&maps[(size_t) map * refMapSize]);
+  }
 };
 
 class bioem_RefMap_Mod : public bioem_RefMap
 {
 public:
-	__host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(maps[(x * numPixels + y) * ntotRefMap + map]);}
-
-	void init(const bioem_RefMap& map)
-	{
-		maps = (myfloat_t*) malloc(map.refMapSize * map.ntotRefMap * sizeof(myfloat_t));
-		#pragma omp parallel for
-		for (int i = 0; i < map.ntotRefMap; i++)
-		{
-			for (int j = 0; j < map.numPixels; j++)
-			{
-				for (int k = 0; k < map.numPixels; k++)
-				{
-					maps[(j * map.numPixels + k) * map.ntotRefMap + i] = map.get(i, j, k);
-				}
-			}
-		}
-	}
+  __host__ __device__ inline myfloat_t get(int map, int x, int y) const
+  { // map index varies fastest; size_t offset avoids int overflow
+    return (maps[((size_t) x * numPixels + y) * ntotRefMap + map]);
+  }
+
+  void init(const bioem_RefMap &map)
+  { // copies `map` with the map index as the fastest-varying one
+    // size_t arithmetic: refMapSize * ntotRefMap may exceed INT_MAX
+    maps = (myfloat_t *) malloc(sizeof(myfloat_t) * map.refMapSize *
+                                map.ntotRefMap);
+#pragma omp parallel for
+    for (int i = 0; i < map.ntotRefMap; i++)
+    {
+      for (int j = 0; j < map.numPixels; j++)
+      {
+        for (int k = 0; k < map.numPixels; k++)
+          maps[((size_t) j * map.numPixels + k) * map.ntotRefMap + i] =
+              map.get(i, j, k);
+      }
+    }
+  }
 };
 
 class bioem_Probability_map
 {
 public:
-	myfloat_t Total;
-	myfloat_t Constoadd;
-	
-	class bioem_Probability_map_max
-	{
-	public:
-		int max_prob_cent_x, max_prob_cent_y, max_prob_orient, max_prob_conv;
-			myfloat_t max_prob_norm,max_prob_mu;
-	} max;
+  myprob_t Total;
+  myprob_t Constoadd;
+
+  class bioem_Probability_map_max
+  {
+  public:
+    int max_prob_cent_x, max_prob_cent_y, max_prob_orient, max_prob_conv;
+    myfloat_t max_prob_norm, max_prob_mu;
+  } max;
 };
 
 class bioem_Probability_angle
 {
 public:
-	myfloat_t forAngles;
-	myfloat_t ConstAngle;
-	myfloat_t priorang;
-};
-
-class bioem_Probability_cc
-{
-public:
-	 myfloat_t forCC;
-         myfloat_t ConstCC;
+  myprob_t forAngles;
+  myprob_t ConstAngle;
 };
 
 class bioem_Probability
 {
 public:
-	int nMaps;
-	int nAngles;
-	int nCC;
-
-	__device__ __host__ bioem_Probability_map& getProbMap(int map) {return(ptr_map[map]);}
-	__device__ __host__ bioem_Probability_angle& getProbAngle(int map, int angle) {return(ptr_angle[angle * nMaps + map]);}
-	__device__ __host__ bioem_Probability_cc& getProbCC(int map, int cc) {return(ptr_cc[cc * nMaps + map]);}
-
-	void* ptr;
-	bioem_Probability_map* ptr_map;
-	bioem_Probability_angle* ptr_angle;
-	bioem_Probability_cc* ptr_cc;
-
-	static size_t get_size(size_t maps, size_t angles, size_t cc, bool writeAngles, bool writeCC)
-	{
-		size_t size = sizeof(bioem_Probability_map);
-		if (writeAngles) size += angles * sizeof(bioem_Probability_angle);
-		if (writeCC) size += cc * sizeof(bioem_Probability_cc);
-		return(maps * size);
-	}
-
-	void init(size_t maps, size_t angles, size_t cc, bioem& bio);
-	void copyFrom(bioem_Probability* from, bioem& bio);
-
-	void set_pointers()
-	{
-		ptr_map = (bioem_Probability_map*) ptr;
-		ptr_angle = (bioem_Probability_angle*) (&ptr_map[nMaps]);
-//		ptr_cc = (bioem_Probability_cc*) (&ptr_angle[nMaps * nAngles]);
-                ptr_cc = (bioem_Probability_cc*) (&ptr_map[nMaps]);
-	}
+  int nMaps;
+  int nAngles;
+
+  __device__ __host__ bioem_Probability_map &getProbMap(int map)
+  {
+    return (ptr_map[map]);
+  }
+  __device__ __host__ bioem_Probability_angle &getProbAngle(int map, int angle)
+  { // size_t offset: angle * nMaps may exceed INT_MAX (cf. get_size)
+    return (ptr_angle[(size_t) angle * nMaps + map]);
+  }
+
+  void *ptr;
+  bioem_Probability_map *ptr_map;
+  bioem_Probability_angle *ptr_angle;
+
+  static size_t get_size(size_t maps, size_t angles, int writeAngles)
+  {
+    size_t size = sizeof(bioem_Probability_map);
+    if (writeAngles)
+      size += angles * sizeof(bioem_Probability_angle);
+    return (maps * size);
+  }
+
+  void init(size_t maps, size_t angles, bioem &bio);
+  void copyFrom(bioem_Probability *from, bioem &bio);
+
+  void set_pointers()
+  {
+    ptr_map = (bioem_Probability_map *) ptr;
+    ptr_angle = (bioem_Probability_angle *) (&ptr_map[nMaps]);
+  }
 };
 
 #endif
diff --git a/include/model.h b/include/model.h
index fcfafa29a743f9b6af656aa8e2d5674eefc871a6..46736f9776fe01a64b6fafa46e55e97e52919140 100644
--- a/include/model.h
+++ b/include/model.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
+   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
         Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -16,32 +17,31 @@
 
 #include "defs.h"
 #include "param.h"
-#include <boost/concept_check.hpp>
 
 class bioem_model
 {
 public:
-	class bioem_model_point
-	{
-	public:
-		myfloat3_t point;
-		myfloat_t radius;
-		myfloat_t density;
-	};
-	
-	bioem_model();
-	~bioem_model();
-
-	int readModel(bioem_param& param, const char* filemodel);
-
-	bool readPDB;
-
-	myfloat_t getAminoAcidRad(char *name);
-	myfloat_t getAminoAcidDensity(char *name);
-	myfloat_t NormDen;
-
-	int nPointsModel;
-	bioem_model_point* points;
+  class bioem_model_point
+  {
+  public:
+    myfloat3_t point;
+    myfloat_t radius;
+    myfloat_t density;
+  };
+
+  bioem_model();
+  ~bioem_model();
+
+  int readModel(bioem_param &param, const char *filemodel);
+
+  bool readPDB;
+
+  myfloat_t getAminoAcidRad(char *name);
+  myfloat_t getAminoAcidDensity(char *name);
+  myfloat_t NormDen;
+
+  int nPointsModel;
+  bioem_model_point *points;
 };
 
 #endif
diff --git a/include/param.h b/include/param.h
index 8a952a56fdad73d81bfed65621cc503fb2a2b3cb..5faf1b63f968607e3c22d7087a3dd20b0d3768ac 100644
--- a/include/param.h
+++ b/include/param.h
@@ -1,12 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -17,145 +18,139 @@
 #include "defs.h"
 #include "map.h"
 #include <complex>
-#include <math.h>
 #include <fftw3.h>
+#include <math.h>
 
 using namespace std;
 
 class bioem_param_device
 {
 public:
-// Grids in center assuming equidistance from 0,0
-	int maxDisplaceCenter;
-	int GridSpaceCenter;
-	int NumberPixels;
-	int NumberFFTPixels1D;
-	int NtotDist;
-      
-	myfloat_t Ntotpi;
-	myfloat_t volu;
-	myfloat_t sigmaPriorbctf;
- 	myfloat_t sigmaPriordefo;
-        myfloat_t Priordefcent;
-
-        
-// If to write Probabilities of Angles from Model
-	bool writeAngles;
-        bool writeCC;
-        bool flipped;
-        bool debugterm;
-	int CCdisplace;
-        bool CCwithBayes;
-        bool tousepsf;
-
-
+  // Grids in center assuming equidistance from 0,0
+  int maxDisplaceCenter;
+  int GridSpaceCenter;
+  int NumberPixels;
+  int NumberFFTPixels1D;
+  int NxDisp;
+  int NtotDisp;
+
+  myfloat_t Ntotpi;
+  myfloat_t volu;
+  myfloat_t sigmaPriorbctf;
+  myfloat_t sigmaPriordefo;
+  myfloat_t Priordefcent;
+  myfloat_t sigmaPrioramp;
+  myfloat_t Priorampcent;
+  // If to write Probabilities of Angles from Model
+  int writeAngles;
+  bool tousepsf;
 };
 
 class bioem_param
 {
 public:
-	bioem_param();
-	~bioem_param();
-
-
-	int readParameters(const char* fileinput);
-	int CalculateGridsParam(const char* fileangles);
-	int CalculateRefCTF();
-        int forprintBest(const char* fileinput);
-	void PrepareFFTs();
-        bool doaaradius;
-        bool writeCTF;
-        bool ignoreCCoff;
-	bool nocentermass;
-	bool printrotmod;
-	bool readquatlist;
-	bool showrotatemod;
-	bool notsqure;
-        bool notnormmap;
-	bool usepsf;
-        bool ignorepointsout;
-	bool ignorePDB;
- 
-        myfloat_t elecwavel;
-
-	bioem_param_device param_device;
-
-	int FFTMapSize;
-	int Alignment;
-	mycomplex_t* refCTF;
-	myfloat3_t* CtfParam;
-	size_t getRefCtfCount() {return nTotCTFs * FFTMapSize;}
-	size_t getCtfParamCount() {return nTotCTFs;}
-
-	myfloat_t pixelSize;
-// Priors
-	myfloat_t priorMod;
-	bool yespriorAngles;
-        myfloat_t* angprior;
-
-// Grid Points in Euler angles, assuming uniform sampling d_alpha=d_gamma (in 2pi) & cos(beta)=-1,1
-	int angleGridPointsAlpha;
-	int angleGridPointsBeta;
-
-	int GridPointsQuatern;
-	bool doquater;
-
-	myfloat_t voluang;
-        bool notuniformangles;
-        int NotUn_angles;
-
-	bool withnoise;
-        myfloat_t stnoise;
-//        std::string inanglef;
-//	std::string quatfile;
-
-	int numberGridPointsDisplaceCenter;
-// Grid sampling for the convolution kernel
-
-//        CTF 
-	myfloat_t startBfactor, endBfactor;
-	int numberBfactor;
-        myfloat_t startDefocus, endDefocus;
-        int numberDefocus;
- 
-	//ENVELOPE
-	myfloat_t startGridEnvelop;
-	myfloat_t endGridEnvelop;
-	int numberGridPointsEnvelop;
-	myfloat_t gridEnvelop;
-	//CTF=Amp*cos(phase*x)-sqrt(1-Amp**2)*sin(phase*x)
-	myfloat_t startGridCTF_phase;
-	myfloat_t endGridCTF_phase;
-	int numberGridPointsCTF_phase;
-	myfloat_t gridCTF_phase;
-	myfloat_t startGridCTF_amp;
-	myfloat_t endGridCTF_amp;
-	int numberGridPointsCTF_amp;
-	myfloat_t gridCTF_amp;
-	// Others
-	myfloat3_t* angles;
-	int nTotGridAngles;
-	int nTotCTFs;
-	int nTotCC;
-	int shiftX,shiftY;
-	
-	bool printModel;
-	int printModelOrientation;
-	int printModelConvolution;
-
-	int fft_plans_created;
-	myfftw_plan fft_plan_c2c_forward, fft_plan_c2c_backward, fft_plan_r2c_forward, fft_plan_c2r_backward;
-
-	mycomplex_t** fft_scratch_complex;
-	myfloat_t** fft_scratch_real;
-
-	bool dumpMap, loadMap;
-
-       int ddx,ddy;
-       myfloat_t bestnorm,bestoff;
+  bioem_param();
+  ~bioem_param();
+
+  int readParameters(const char *fileinput);
+  int CalculateGridsParam(const char *fileangles);
+  int CalculateRefCTF();
+  int forprintBest(const char *fileinput);
+  void PrepareFFTs();
+  bool doaaradius;
+  bool writeCTF;
+  bool nocentermass;
+  bool printrotmod;
+  bool readquatlist;
+  bool showrotatemod;
+  bool notnormmap;
+  bool usepsf;
+  bool ignorepointsout;
+  bool ignorePDB;
+
+  myfloat_t elecwavel;
+
+  bioem_param_device param_device;
+
+  int FFTMapSize;
+  int Alignment;
+  mycomplex_t *refCTF;
+  myfloat3_t *CtfParam;
+  size_t getRefCtfCount() { return (size_t) nTotCTFs * FFTMapSize; } // widened: no int overflow
+  size_t getCtfParamCount() { return nTotCTFs; }
+
+  myfloat_t pixelSize;
+  // Priors
+  myfloat_t priorMod;
+  bool yespriorAngles;
+  myfloat_t *angprior;
+
+  // Grid Points in Euler angles, assuming uniform sampling d_alpha=d_gamma (in
+  // 2pi) & cos(beta)=-1,1
+  int angleGridPointsAlpha;
+  int angleGridPointsBeta;
+
+  int GridPointsQuatern;
+  bool doquater;
+
+  myfloat_t voluang;
+  bool notuniformangles;
+  int NotUn_angles;
+
+  bool withnoise;
+  myfloat_t stnoise;
+  //        std::string inanglef;
+  //	std::string quatfile;
+
+  int numberGridPointsDisplaceCenter;
+  // Grid sampling for the convolution kernel
+
+  //        CTF
+  myfloat_t startBfactor, endBfactor;
+  int numberBfactor;
+  myfloat_t startDefocus, endDefocus;
+  int numberDefocus;
+
+  // ENVELOPE
+  myfloat_t startGridEnvelop;
+  myfloat_t endGridEnvelop;
+  int numberGridPointsEnvelop;
+  myfloat_t gridEnvelop;
+  // CTF=Amp*cos(phase*x)-sqrt(1-Amp**2)*sin(phase*x)
+  myfloat_t startGridCTF_phase;
+  myfloat_t endGridCTF_phase;
+  int numberGridPointsCTF_phase;
+  myfloat_t gridCTF_phase;
+  myfloat_t startGridCTF_amp;
+  myfloat_t endGridCTF_amp;
+  int numberGridPointsCTF_amp;
+  myfloat_t gridCTF_amp;
+  // Others
+  myfloat3_t *angles;
+  int nTotGridAngles;
+  int nTotCTFs;
+  int shiftX, shiftY;
+
+  int nTotParallelConv;
+  int nTotParallelMaps;
+
+  bool printModel;
+  bool BestmapCalcCC;
+
+  int fft_plans_created;
+  myfftw_plan fft_plan_c2c_forward, fft_plan_c2c_backward, fft_plan_r2c_forward,
+      fft_plan_c2r_backward;
+
+  mycomplex_t **fft_scratch_complex;
+  myfloat_t **fft_scratch_real;
+
+  bool dumpMap, loadMap;
+
+  int ddx, ddy;
+  myfloat_t bestnorm, bestoff;
 
 private:
-	void releaseFFTPlans();
+  void releaseFFTPlans();
 };
 
 #endif
diff --git a/include/timer.h b/include/timer.h
index 2875aad47f906cc066bdcd0d49d070097609e31c..06d5b2b6924f79362b1be242c0ae9b36588f6f52 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -1,9 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -11,68 +15,82 @@
 #ifndef TIMER_H
 #define TIMER_H
 
+#include <algorithm>
+#include <cmath>
+#include <numeric>
 #include <stdio.h>
 #include <string>
-#include <numeric>
 #include <vector>
-#include <algorithm>
-#include <cmath>
 
 using namespace std;
 
-class HighResTimer {
+class HighResTimer
+{
 
 public:
-	HighResTimer();
-	~HighResTimer();
-	void Start();
-	void Stop();
-	void Reset();
-	void ResetStart();
-	double GetElapsedTime();
-	double GetCurrentElapsedTime();
+  HighResTimer();
+  ~HighResTimer();
+  void Start();
+  void Stop();
+  void Reset();
+  void ResetStart();
+  double GetElapsedTime();
+  double GetCurrentElapsedTime();
 
 private:
-	static double Frequency;
-	static double GetFrequency();
+  static double Frequency;
+  static double GetFrequency();
 
-	double ElapsedTime;
-	double StartTime;
-	int running;
+  double ElapsedTime;
+  double StartTime;
+  int running;
 };
 
 /* Structure for saving a vector of timings */
-typedef struct _TimeLog {
-	vector<double> vec;
-
-	double sum;
-	double stdev;
-
-	string name;
-}TimeLog;
-enum TS_NAMES{TS_TPROJECTION, TS_PROJECTION, TS_CONVOLUTION, TS_COMPARISON};
+typedef struct _TimeLog
+{
+  vector<double> vec;
+
+  double sum;
+  double stdev;
+
+  string name;
+} TimeLog;
+enum TS_NAMES
+{
+  TS_TPROJECTION,
+  TS_PROJECTION,
+  TS_CONVOLUTION,
+  TS_COMPARISON
+};
 
-/* Structure for saving timings of different parts of code and doing basic statistics on them */
-class TimeStat {
+/* Structure for saving timings of different parts of code and doing basic
+ * statistics on them */
+class TimeStat
+{
 
 public:
-	TimeStat(int Angles, int CTFs) : time(0),tl(NULL) {angles = Angles; ctfs = CTFs;};
-	~TimeStat() {EmptyTimeStat();};
-	void InitTimeLog(int log, int size, string s);
-	void InitTimeStat(int nlogs);
-	void EmptyTimeStat();
-	void inline Add(int log) {tl[log].vec.push_back(time);};
-	void ComputeTimeStat();
-	void PrintTimeStat(int mpi_rank);
-
-	/* Variable for storing times during the execution */
-	double time;
+  TimeStat(int Angles, int CTFs) : time(0), tl(NULL)
+  {
+    angles = Angles;
+    ctfs = CTFs;
+  };
+  ~TimeStat() { EmptyTimeStat(); };
+  void InitTimeLog(int log, int size, string s);
+  void InitTimeStat(int nlogs);
+  void EmptyTimeStat();
+  void inline Add(int log) { tl[log].vec.push_back(time); };
+  void ComputeTimeStat();
+  void PrintTimeStat(int mpi_rank);
+
+  /* Variable for storing times during the execution */
+  double time;
 
 private:
-	TimeLog* tl;
-	int total_logs;
-	int angles;
-	int ctfs;
+  TimeLog *tl;
+  int total_logs;
+  int angles;
+  int ctfs;
 };
 
 #endif
diff --git a/main.cpp b/main.cpp
index 565be80f4018de3c9b288445a8f8a3c9f31b8a6f..6929a95a31bc46bee5ab49bd16310860933e4bd3 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,13 +1,13 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
-   
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -15,27 +15,28 @@
 #ifdef WITH_MPI
 #include <mpi.h>
 
-#define MPI_CHK(expr) \
-	if (expr != MPI_SUCCESS) \
-	{ \
-		fprintf(stderr, "Error in MPI function %s: %d\n", __FILE__, __LINE__); \
-	}
+#define MPI_CHK(expr)                                                          \
+  if (expr != MPI_SUCCESS)                                                     \
+  {                                                                            \
+    fprintf(stderr, "Error in MPI function %s: %d\n", __FILE__, __LINE__);     \
+  }
 #endif
 
+#include <fenv.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
-#include <fenv.h>
 
 #ifdef _WIN32
-#include <Windows.h>
 #include <WinBase.h>
+#include <Windows.h>
 #endif
 
-#include <iostream>
 #include <algorithm>
+#include <iostream>
 #include <iterator>
+
 #include "bioem.h"
 #include "bioem_cuda.h"
 
@@ -53,58 +54,82 @@ int mpi_size = 1;
 
 #include "timer.h"
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 {
-	// **************************************************************************************
-	// *********************************  Main BioEM code **********************************
-	// ************************************************************************************
+// **************************************************************************************
+// *********************************  Main BioEM code
+// **********************************
+// ************************************************************************************
 
 #ifdef WITH_MPI
-	MPI_CHK(MPI_Init(&argc, &argv));
-	MPI_CHK(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank));
-	MPI_CHK(MPI_Comm_size(MPI_COMM_WORLD, &mpi_size));
+  MPI_CHK(MPI_Init(&argc, &argv));
+  MPI_CHK(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank));
+  MPI_CHK(MPI_Comm_size(MPI_COMM_WORLD, &mpi_size));
 #endif
 
 #ifdef _MM_DENORMALS_ZERO_ON
-	#pragma omp parallel
-	{
-		_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); //Flush denormals to zero in all OpenMP threads
-	}
+#pragma omp parallel
+  {
+    _MM_SET_DENORMALS_ZERO_MODE(
+        _MM_DENORMALS_ZERO_ON); // Flush denormals to zero in all OpenMP threads
+  }
 #endif
-	HighResTimer timer;
+  HighResTimer timer;
 
-	bioem* bio;
+  bioem *bio;
 #ifdef WITH_CUDA
-	if (getenv("GPU") && atoi(getenv("GPU")))
-	{
-		bio = bioem_cuda_create();
-	}
-	else
+  if (getenv("GPU") && atoi(getenv("GPU")))
+  {
+    bio = bioem_cuda_create();
+  }
+  else
 #endif
-	{
-		bio = new bioem;
-	}
-
-	// ************  Configuration and Pre-calculating necessary objects *****************
-//	if (mpi_rank == 0) printf("Configuring\n");
-	if (bio->configure(argc, argv) == 0)
-	{
-
-		// *******************************  Run BioEM routine ******************************
-		if (mpi_rank == 0) printf("Running\n");
-		timer.Start();
-		bio->run();
-		timer.Stop();
-
-		// ************************************ End **********************************
-		printf ("The code ran for %f seconds (rank %d).\n", timer.GetElapsedTime(), mpi_rank);
-		bio->cleanup();
-	}
-	delete bio;
+  {
+    bio = new bioem;
+  }
+
+  // ************  Configuration and Pre-calculating necessary objects
+  // *****************
+  if (mpi_rank == 0)
+    printf("Configuring\n");
+  if (bio->configure(argc, argv) == 0)
+  {
+    if (bio->needToPrintModel())
+    {
+      if (mpi_size == 1)
+      {
+        bio->printModel();
+      }
+      else
+      {
+        printf("ERROR: Model printing can be performed only if using a single "
+               "MPI process. Please change your execution to use a single MPI "
+               "process or no MPI at all.\n");
+        exit(1);
+      }
+    }
+    else
+    {
+      // *******************************  Run BioEM routine
+      // ******************************
+      if (mpi_rank == 0)
+        printf("Running\n");
+      timer.Start();
+      bio->run();
+      timer.Stop();
+
+      // ************************************ End
+      // **********************************
+      printf("The code ran for %f seconds (rank %d).\n", timer.GetElapsedTime(),
+             mpi_rank);
+      bio->cleanup();
+    }
+  }
+  delete bio;
 
 #ifdef WITH_MPI
-	MPI_Finalize();
+  MPI_Finalize();
 #endif
 
-	return(0);
+  return (0);
 }
diff --git a/map.cpp b/map.cpp
index dbdd7c0043a7b5e891958f4a816af27654390c32..6d155251ee4e067e2436221f737afe1b498c70d5 100644
--- a/map.cpp
+++ b/map.cpp
@@ -1,399 +1,424 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
-   Note: This program contains subroutine "read_MRC" of the Situs 2.7.2 program. 
-   Ref: Willy Wriggers. Using Situs for the Integration of Multi-Resolution Structures. 
+   ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+   Note: This program contains subroutine "read_MRC" of the Situs 2.7.2 program.
+   Ref: Willy Wriggers. Using Situs for the Integration of Multi-Resolution
+   Structures.
    Biophysical Reviews, 2010, Vol. 2, pp. 21-27.
    with a GPL lisences version 3.
 
-
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 
+#include <cstring>
+#include <fftw3.h>
 #include <fstream>
 #include <iostream>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cstring>
-#include <math.h>
-#include <fftw3.h>
 
 #ifdef WITH_OPENMP
 #include <omp.h>
 #endif
 
+#include "bioem.h"
 #include "map.h"
 #include "param.h"
-#include "bioem.h"
 
 using namespace std;
 
-int bioem_RefMap::readRefMaps(bioem_param& param, const char* filemap)
+int bioem_RefMap::readRefMaps(bioem_param &param, const char *filemap)
 {
   numPixels = param.param_device.NumberPixels;
-  refMapSize = param.param_device.NumberPixels * param.param_device.NumberPixels;
+  refMapSize =
+      param.param_device.NumberPixels * param.param_device.NumberPixels;
   // **************************************************************************************
-  // ***********************Reading reference Particle Maps************************
+  // ***********************Reading reference Particle
+  // Maps************************
   // **************************************************************************************
   int allocsize = 0;
   if (param.loadMap)
+  {
+    //************** Loading Map from Binary file *******
+    FILE *fp = fopen("maps.dump", "rb");
+    if (fp == NULL)
     {
-      //************** Loading Map from Binary file *******
-      FILE* fp = fopen("maps.dump", "rb");
-      if (fp == NULL)
-	{
-	  cout << "Error opening dump file\n";
-	  exit(1);
-	}
-      size_t elements_read;
-      elements_read = fread(&ntotRefMap, sizeof(ntotRefMap), 1, fp);
-      if (elements_read != 1)
-      {
-        cout << "Error reading file\n";
-        exit(1);
-      }
-      maps = (myfloat_t*) mallocchk(ntotRefMap * refMapSize * sizeof(myfloat_t));
-      elements_read = fread(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp);
-      if (elements_read != (size_t) ntotRefMap)
-      {
-        cout << "Error reading file\n";
-        exit(1);
-      }
-
-      fclose(fp);
-
-      cout << "Particle Maps read from Map Dump\n";
+      cout << "Error opening dump file\n";
+      exit(1);
+    }
+    size_t elements_read;
+    elements_read = fread(&ntotRefMap, sizeof(ntotRefMap), 1, fp);
+    if (elements_read != 1)
+    {
+      cout << "Error reading file\n";
+      exit(1);
     }
-  else if(readMRC)
+    maps = (myfloat_t *) mallocchk(ntotRefMap * refMapSize * sizeof(myfloat_t));
+    elements_read = fread(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp);
+    if (elements_read != (size_t) ntotRefMap)
     {
-      //************** Reading MRC file *******
-      ntotRefMap=0;
+      cout << "Error reading file\n";
+      exit(1);
+    }
 
-      if(readMultMRC)
-	{
+    fclose(fp);
 
-	  //************** Reading Multiple MRC files *************
-	  cout << "Opening File with MRC list names: " << filemap << "\n"; 
-	  ifstream input(filemap);
+    cout << "Particle Maps read from Map Dump\n";
+  }
+  else if (readMRC)
+  {
+    //************** Reading MRC file *******
+    ntotRefMap = 0;
 
-	  if (!input.good())
-	    {
-	      cout << "Failed to open file contaning MRC names: " << filemap << "\n";
-	      exit(1);
-	    }
+    if (readMultMRC)
+    {
 
-	  char line[512] = {0};
-	  char mapname[100];
-	  char tmpm[10] = {0};
-	  const char* indifile;
+      //************** Reading Multiple MRC files *************
+      cout << "Opening File with MRC list names: " << filemap << "\n";
+      ifstream input(filemap);
 
-	  while (!input.eof())
-	    {
-	      input.getline(line,511);
-	      char tmpVals[100]  = {0};
+      if (!input.good())
+      {
+        cout << "Failed to open file contaning MRC names: " << filemap << "\n";
+        exit(1);
+      }
 
-	      string strline(line);
+      char line[512] = {0};
+      char mapname[100];
+      char tmpm[10] = {0};
+      const char *indifile;
 
-	      //	 cout << "MRC File name:" << strline << "\n";
+      while (!input.eof())
+      {
+        input.getline(line, 511);
+        char tmpVals[100] = {0};
 
+        string strline(line);
 
-	      strncpy (tmpVals,line,99);
-	      sscanf (tmpVals,"%99c",mapname);
+        //	 cout << "MRC File name:" << strline << "\n";
 
-	      // Check for last line
-	      strncpy (tmpm,mapname,3);
+        strncpy(tmpVals, line, 99);
+        sscanf(tmpVals, "%99c", mapname);
 
-	      if(strcmp(tmpm,"XXX")!=0)
-		{
-		  indifile=strline.c_str();
-          
-		  //   size_t foundpos= strline.find("mrc");
-		  //   size_t endpos = strline.find_last_not_of(" \t");
-       
+        // Check for last line
+        strncpy(tmpm, mapname, 3);
 
-		  //Reading Multiple MRC
-		  read_MRC(indifile,param);
-		}
-	      for(int i=0;i<3;i++)mapname[i] = 'X';
-	      for(int i=3;i<100;i++)mapname[i] = 0;
+        if (strcmp(tmpm, "XXX") != 0)
+        {
+          indifile = strline.c_str();
 
-	    }
-	  cout << "\n+++++++++++++++++++++++++++++++++++++++++++ \n";
-	  cout << "Particle Maps read from MULTIPLE MRC Files in: " << filemap << "\n" ;
-	}
-      else
-	{
+          //   size_t foundpos= strline.find("mrc");
+          //   size_t endpos = strline.find_last_not_of(" \t");
 
-	  string strfilename(filemap);
+          // Reading Multiple MRC
+          read_MRC(indifile, param);
+        }
+        for (int i = 0; i < 3; i++)
+          mapname[i] = 'X';
+        for (int i = 3; i < 100; i++)
+          mapname[i] = 0;
+      }
+      cout << "\n+++++++++++++++++++++++++++++++++++++++++++ \n";
+      cout << "Particle Maps read from MULTIPLE MRC Files in: " << filemap
+           << "\n";
+    }
+    else
+    {
 
-	  size_t foundpos= strfilename.find("mrc");
-	  size_t endpos = strfilename.find_last_not_of(" \t");
+      string strfilename(filemap);
 
-	  if(foundpos > endpos){
-	    cout << "Warining:::: mrc extension NOT dectected in file name::" << filemap <<" \n";
-	    cout << "Warining::::  Are you sure you want to read an MRC? \n";
-	  }
+      size_t foundpos = strfilename.find("mrc");
+      size_t endpos = strfilename.find_last_not_of(" \t");
 
-	  read_MRC(filemap,param);
-	  cout << "\n++++++++++++++++++++++++++++++++++++++++++ \n";
-	  cout << "Particle Maps read from ONE MRC File: " << filemap << "\n" ;
-	}
+      if (foundpos > endpos)
+      {
+        cout << "Warining:::: mrc extension NOT dectected in file name::"
+             << filemap << " \n";
+        cout << "Warining::::  Are you sure you want to read an MRC? \n";
+      }
+
+      read_MRC(filemap, param);
+      cout << "\n++++++++++++++++++++++++++++++++++++++++++ \n";
+      cout << "Particle Maps read from ONE MRC File: " << filemap << "\n";
     }
+  }
   else
+  {
+    //************** Reading Text file *************
+    int nummap = -1;
+    int lasti = 0;
+    int lastj = 0;
+    ifstream input(filemap);
+    if (!input.good())
     {
-      //************** Reading Text file *************
-      int nummap = -1;
-      int lasti = 0;
-      int lastj = 0;
-      ifstream input(filemap);
-      if (!input.good())
-	{
-	  cout << "Particle Maps Failed to open file" << endl ;
-	  exit(1);
-	}
+      cout << "Particle Maps Failed to open file" << endl;
+      exit(1);
+    }
 
-      char line[512] = {0};
-      char tmpLine[512] = {0};
-      bool first=true;
+    char line[512] = {0};
+    char tmpLine[512] = {0};
+    bool first = true;
 
-      int countpix=0;
+    int countpix = 0;
 
-      while (!input.eof())
-	{
-	  input.getline(line, 511);
-
-	  strncpy(tmpLine, line, strlen(line));
-	  char *token = strtok(tmpLine, " ");
-
-	  if(first){
-	    if (strcmp(token, "PARTICLE") != 0) {
-	      cout << "Missing correct Standard Map Format: PARTICLE HEADER\n"<< endl ;
-	      exit(1);
-	    }
-	    first=false;
-	  }
-
-
-	  if (strcmp(token, "PARTICLE") == 0) // to count the number of maps
-	    {
-	      nummap++;
-	      countpix=0;
-	      if (allocsize == 0)
-		{
-		  allocsize = 64;
-		  maps = (myfloat_t*) mallocchk(refMapSize * sizeof(myfloat_t) * allocsize);
-		}
-	      else if (nummap + 1 >= allocsize)
-		{
-		  allocsize *= 2;
-		  maps = (myfloat_t*) reallocchk(maps, refMapSize * sizeof(myfloat_t) * allocsize);
-		}
-	      if (nummap % 128 == 0)
-		{
-		  cout << "..." << nummap << "\n";
-		}
-	      if(lasti+1 != param.param_device.NumberPixels && lastj+1 != param.param_device.NumberPixels && nummap > 0)
-		{
-		  cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE ( " << param.param_device.NumberPixels << ", i " << lasti << ", j " << lastj << ")" << "\n";
-		  exit(1);
-		}
-	    }
-	  else
-	    {
-	      int i, j;
-	      float z;
-	      
-	      char tmpVals[36]  = {0};
-
-	      strncpy (tmpVals, line, 8);
-	      sscanf (tmpVals, "%d", &i);
-
-	      strncpy (tmpVals, line + 8, 8);
-	      sscanf (tmpVals, "%d", &j);
-
-	      strncpy (tmpVals, line + 16, 16);
-	      sscanf (tmpVals, "%f", &z);
-	      //checking for Map limits
-	      if(i > -1 && i  < param.param_device.NumberPixels && j > -1 && j  < param.param_device.NumberPixels)
-		{
-		  countpix++;
-		  maps[nummap * refMapSize + i * numPixels + j] = (myfloat_t) z;
-		  lasti = i;
-		  lastj = j;
-		  //	 cout << countpix << " " << param.param_device.NumberPixels*param.param_device.NumberPixels << "\n";
-		}
-	      else
-		{
-		  cout << "PROBLEM READING MAP (Map number " << nummap << ", i " << i << ", j " << j << ")" << "\n";
-		  exit(1);
-		}
-	    }
-	}
-      if(lasti != param.param_device.NumberPixels-1 || lastj != param.param_device.NumberPixels-1 || countpix != param.param_device.NumberPixels*param.param_device.NumberPixels +1 )
-	{
-	  cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE ( " << param.param_device.NumberPixels << ", i " << lasti << ", j " << lastj << ")" << "\n";
-	  exit(1);
-	}
-      cout << ".";
-      ntotRefMap = nummap + 1;
-      maps = (myfloat_t*) reallocchk(maps, refMapSize * sizeof(myfloat_t) * ntotRefMap);
-      cout << "Particle Maps read from Standard File: " << ntotRefMap << "\n";
+    while (!input.eof())
+    {
+      input.getline(line, 511);
+
+      strncpy(tmpLine, line, strlen(line));
+      char *token = strtok(tmpLine, " ");
+
+      if (first)
+      {
+        if (strcmp(token, "PARTICLE") != 0)
+        {
+          cout << "Missing correct Standard Map Format: PARTICLE HEADER\n"
+               << endl;
+          exit(1);
+        }
+        first = false;
+      }
+
+      if (strcmp(token, "PARTICLE") == 0) // to count the number of maps
+      {
+        nummap++;
+        countpix = 0;
+        if (allocsize == 0)
+        {
+          allocsize = 64;
+          maps = (myfloat_t *) mallocchk(refMapSize * sizeof(myfloat_t) *
+                                         allocsize);
+        }
+        else if (nummap + 1 >= allocsize)
+        {
+          allocsize *= 2;
+          maps = (myfloat_t *) reallocchk(maps, refMapSize * sizeof(myfloat_t) *
+                                                    allocsize);
+        }
+        if (nummap % 128 == 0)
+        {
+          cout << "..." << nummap << "\n";
+        }
+        if (lasti + 1 != param.param_device.NumberPixels &&
+            lastj + 1 != param.param_device.NumberPixels && nummap > 0)
+        {
+          cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE "
+                  "( "
+               << param.param_device.NumberPixels << ", i " << lasti << ", j "
+               << lastj << ")"
+               << "\n";
+          exit(1);
+        }
+      }
+      else
+      {
+        int i, j;
+        float z;
+
+        char tmpVals[36] = {0};
+
+        strncpy(tmpVals, line, 8);
+        sscanf(tmpVals, "%d", &i);
+
+        strncpy(tmpVals, line + 8, 8);
+        sscanf(tmpVals, "%d", &j);
+
+        strncpy(tmpVals, line + 16, 16);
+        sscanf(tmpVals, "%f", &z);
+        // checking for Map limits
+        if (i > -1 && i < param.param_device.NumberPixels && j > -1 &&
+            j < param.param_device.NumberPixels)
+        {
+          countpix++;
+          maps[nummap * refMapSize + i * numPixels + j] = (myfloat_t) z;
+          lasti = i;
+          lastj = j;
+          //	 cout << countpix << " " <<
+          // param.param_device.NumberPixels*param.param_device.NumberPixels <<
+          //"\n";
+        }
+        else
+        {
+          cout << "PROBLEM READING MAP (Map number " << nummap << ", i " << i
+               << ", j " << j << ")"
+               << "\n";
+          exit(1);
+        }
+      }
     }
+    if (lasti != param.param_device.NumberPixels - 1 ||
+        lastj != param.param_device.NumberPixels - 1 ||
+        countpix !=
+            param.param_device.NumberPixels * param.param_device.NumberPixels +
+                1)
+    {
+      cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE ( "
+           << param.param_device.NumberPixels << ", i " << lasti << ", j "
+           << lastj << ")"
+           << "\n";
+      exit(1);
+    }
+    cout << ".";
+    ntotRefMap = nummap + 1;
+    maps = (myfloat_t *) reallocchk(maps, refMapSize * sizeof(myfloat_t) *
+                                              ntotRefMap);
+    cout << "Particle Maps read from Standard File: " << ntotRefMap << "\n";
+  }
 
   //************* If Dumping Maps *********************
   if (param.dumpMap)
+  {
+    FILE *fp = fopen("maps.dump", "w+b");
+    if (fp == NULL)
     {
-      FILE* fp = fopen("maps.dump", "w+b");
-      if (fp == NULL)
-	{
-	  cout << "Error opening dump file\n";
-	  exit(1);
-	}
-      fwrite(&ntotRefMap, sizeof(ntotRefMap), 1, fp);
-      fwrite(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp);
-      fclose(fp);
+      cout << "Error opening dump file\n";
+      exit(1);
     }
+    fwrite(&ntotRefMap, sizeof(ntotRefMap), 1, fp);
+    fwrite(maps, sizeof(myfloat_t) * refMapSize, ntotRefMap, fp);
+    fclose(fp);
+  }
 
   //*********** To Debug with few Maps ********************
 
   if (getenv("BIOEM_DEBUG_NMAPS"))
-    {
-      ntotRefMap = atoi(getenv("BIOEM_DEBUG_NMAPS"));
-    }
+  {
+    ntotRefMap = atoi(getenv("BIOEM_DEBUG_NMAPS"));
+  }
+  param.nTotParallelMaps = min(CUDA_FFTS_AT_ONCE, ntotRefMap);
 
   cout << "Total Number of particles: " << ntotRefMap;
   cout << "\n+++++++++++++++++++++++++++++++++++++++++++ \n";
 
-  return(0);
+  return (0);
 }
 
-int bioem_RefMap::PreCalculateMapsFFT(bioem_param& param)
+int bioem_RefMap::PreCalculateMapsFFT(bioem_param &param)
 {
   // **************************************************************************************
-  // ********** Routine that pre-calculates Reference maps FFT for Convolution/ Comparison **********************
+  // ********** Routine that pre-calculates Reference maps FFT for Convolution/
+  // Comparison **********************
   // ************************************************************************************
 
   RefMapsFFT = new mycomplex_t[ntotRefMap * param.FFTMapSize];
 
 #pragma omp parallel for
-  for (int iRefMap = 0; iRefMap < ntotRefMap ; iRefMap++)
+  for (int iRefMap = 0; iRefMap < ntotRefMap; iRefMap++)
+  {
+    const int num = omp_get_thread_num();
+    myfloat_t *localMap = param.fft_scratch_real[num];
+    mycomplex_t *localout = param.fft_scratch_complex[num];
+
+    // Assigning localMap values to padded Map
+    for (int i = 0; i < param.param_device.NumberPixels; i++)
     {
-      const int num = omp_get_thread_num();
-      myfloat_t* localMap = param.fft_scratch_real[num];
-      mycomplex_t* localout = param.fft_scratch_complex[num];
-
-      //Assigning localMap values to padded Map
-      for(int i = 0; i < param.param_device.NumberPixels; i++)
-	{
-	  for(int j = 0; j < param.param_device.NumberPixels; j++)
-	    {
-	      localMap[i * param.param_device.NumberPixels + j] = maps[iRefMap * refMapSize + i * param.param_device.NumberPixels + j];
-	    }
-	}
-
-      //Calling FFT_Forward
-      myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localMap, localout);
-
-      //Saving the Reference CTFs (RefMap array possibly has incorrect alignment, so we copy here. Stupid but in fact does not matter.)
-      mycomplex_t* RefMap = &RefMapsFFT[iRefMap * param.FFTMapSize];
-               
-      for(int i = 0; i < param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D ; i++ )
-	{
-	  RefMap[i][0] = localout[i][0]; 
-	  RefMap[i][1] = localout[i][1]; 
-	}
+      for (int j = 0; j < param.param_device.NumberPixels; j++)
+      {
+        localMap[i * param.param_device.NumberPixels + j] =
+            maps[iRefMap * refMapSize + i * param.param_device.NumberPixels +
+                 j];
+      }
     }
 
-  return(0);
+    // Calling FFT_Forward
+    myfftw_execute_dft_r2c(param.fft_plan_r2c_forward, localMap, localout);
+
+    // Saving the Reference CTFs (RefMap array possibly has incorrect alignment,
+    // so we copy here. Stupid but in fact does not matter.)
+    mycomplex_t *RefMap = &RefMapsFFT[iRefMap * param.FFTMapSize];
+
+    for (int i = 0; i < param.param_device.NumberPixels *
+                            param.param_device.NumberFFTPixels1D;
+         i++)
+    {
+      RefMap[i][0] = localout[i][0];
+      RefMap[i][1] = localout[i][1];
+    }
+  }
+
+  return (0);
 }
 
-int bioem_RefMap::precalculate(bioem_param& param, bioem& bio)
+int bioem_RefMap::precalculate(bioem_param &param, bioem &bio)
 {
   // **************************************************************************************
-  // *******************************Precalculating Routine for Maps************************
+  // *******************************Precalculating Routine for
+  // Maps************************
   // **************************************************************************************
 
-  sum_RefMap = (myfloat_t*) mallocchk(sizeof(myfloat_t) * ntotRefMap);
-  sumsquare_RefMap = (myfloat_t*) mallocchk(sizeof(myfloat_t) * ntotRefMap);
+  sum_RefMap = (myfloat_t *) mallocchk(sizeof(myfloat_t) * ntotRefMap);
+  sumsquare_RefMap = (myfloat_t *) mallocchk(sizeof(myfloat_t) * ntotRefMap);
 
-  //Precalculating cross-correlations of maps
+// Precalculating cross-correlations of maps
 #pragma omp parallel for
-  for (int iRefMap = 0; iRefMap < ntotRefMap ; iRefMap++)
-    {
-      myfloat_t sum, sumsquare;
-      bio.calcross_cor(getmap(iRefMap), sum, sumsquare);
-      //Storing Crosscorrelations in Map class
-      sum_RefMap[iRefMap] = sum;
-      sumsquare_RefMap[iRefMap] = sumsquare;
-    }
+  for (int iRefMap = 0; iRefMap < ntotRefMap; iRefMap++)
+  {
+    myfloat_t sum, sumsquare;
+    bio.calcross_cor(getmap(iRefMap), sum, sumsquare);
+    // Storing Crosscorrelations in Map class
+    sum_RefMap[iRefMap] = sum;
+    sumsquare_RefMap[iRefMap] = sumsquare;
+  }
 
   // Precalculating Maps in Fourier space
-  if (bio.FFTAlgo)
-    {
-      PreCalculateMapsFFT(param);
-      free(maps);
-      maps = NULL;
-    }
+  PreCalculateMapsFFT(param);
+  free(maps);
+  maps = NULL;
 
-  return(0);
+  return (0);
 }
 
-void bioem_Probability::init(size_t maps, size_t angles, size_t cc, bioem& bio)
+void bioem_Probability::init(size_t maps, size_t angles, bioem &bio)
 {
   //********** Initializing pointers *******************
   nMaps = maps;
   nAngles = angles;
-  nCC = cc;
-  ptr = bio.malloc_device_host(get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC));
-  if (bio.DebugOutput >= 1) cout << "Allocation #Maps " << maps << " #Angles " << angles << " #cross.cor " << cc << "\n";
-  //<< " == " << get_size(maps, angles, cc, bio.param.param_device.writeAngles, bio.param.param_device.writeCC)<< "\n";
+  ptr = bio.malloc_device_host(
+      get_size(maps, angles, bio.param.param_device.writeAngles));
+  if (bio.DebugOutput >= 1)
+    cout << "Allocation #Maps " << maps << " #Angles " << angles << "\n";
   set_pointers();
 }
 
-void bioem_Probability::copyFrom(bioem_Probability* from, bioem& bio)
+void bioem_Probability::copyFrom(bioem_Probability *from, bioem &bio)
 {
 
-  bioem_Probability_map& pProbMap = getProbMap(0);
-  bioem_Probability_map& pProbMapFrom = from->getProbMap(0);
+  bioem_Probability_map &pProbMap = getProbMap(0);
+  bioem_Probability_map &pProbMapFrom = from->getProbMap(0);
   memcpy(&pProbMap, &pProbMapFrom, from->nMaps * sizeof(bioem_Probability_map));
 
   if (bio.param.param_device.writeAngles)
+  {
+    for (int iOrient = 0; iOrient < nAngles; iOrient++)
     {
-      for (int iOrient = 0; iOrient < nAngles; iOrient ++)
-	{
-	  bioem_Probability_angle& pProbAngle = getProbAngle(0, iOrient);
-	  bioem_Probability_angle& pProbAngleFrom = from->getProbAngle(0, iOrient);
-	  memcpy(&pProbAngle, &pProbAngleFrom, from->nMaps * sizeof(bioem_Probability_angle));
-	}
-    }
-
-  if (bio.param.param_device.writeCC)
-    {
-      for (int iCC = 0; iCC < nCC; iCC ++)
-	{
-	  bioem_Probability_cc& pProbCC = getProbCC(0, iCC);
-	  bioem_Probability_cc& pProbCCFrom = from->getProbCC(0, iCC);
-	  memcpy(&pProbCC, &pProbCCFrom, from->nMaps * sizeof(bioem_Probability_cc));
-	}
+      bioem_Probability_angle &pProbAngle = getProbAngle(0, iOrient);
+      bioem_Probability_angle &pProbAngleFrom = from->getProbAngle(0, iOrient);
+      memcpy(&pProbAngle, &pProbAngleFrom,
+             from->nMaps * sizeof(bioem_Probability_angle));
     }
+  }
 }
 
-int bioem_RefMap::read_MRC(const char* filename,bioem_param& param)
+int bioem_RefMap::read_MRC(const char *filename, bioem_param &param)
 {
 
-  /* 	 subroutine "read_MRC" of the Situs 2.7.2 program. 
-	 Ref: Willy Wriggers. Using Situs for the Integration of Multi-Resolution Structures. 
-	 Biophysical Reviews, 2010, Vol. 2, pp. 21-27.*/
-
+  /* 	 subroutine "read_MRC" of the Situs 2.7.2 program.
+         Ref: Willy Wriggers. Using Situs for the Integration of
+     Multi-Resolution Structures.
+         Biophysical Reviews, 2010, Vol. 2, pp. 21-27.*/
 
-  myfloat_t st,st2;
+  myfloat_t st, st2;
   unsigned long count;
   FILE *fin;
   float currfloat;
@@ -401,66 +426,78 @@ int bioem_RefMap::read_MRC(const char* filename,bioem_param& param)
   float xlen, ylen, zlen;
   int mode, ncstart, nrstart, nsstart, ispg, nsymbt, lskflg;
   float a_tmp, b_tmp, g_tmp;
-  int  mx, my, mz,mapc, mapr, maps_local;
+  int mx, my, mz, mapc, mapr, maps_local;
   float dmin, dmax, dmean;
   int n_range_viol0, n_range_viol1;
 
   fin = fopen(filename, "rb");
-  if( fin == NULL ) {
+  if (fin == NULL)
+  {
     cout << "ERROR opening MRC: " << filename;
     exit(1);
   }
-  n_range_viol0 = test_mrc(filename,0);
-  n_range_viol1 = test_mrc(filename,1);
+  n_range_viol0 = test_mrc(filename, 0);
+  n_range_viol1 = test_mrc(filename, 1);
 
-  if (n_range_viol0 < n_range_viol1) { //* guess endianism
+  if (n_range_viol0 < n_range_viol1)
+  { //* guess endianism
     swap = 0;
-    if (n_range_viol0 > 0) {
-      printf(" Warning: %i header field range violations detected in file %s \n", n_range_viol0,filename);
+    if (n_range_viol0 > 0)
+    {
+      printf(
+          " Warning: %i header field range violations detected in file %s \n",
+          n_range_viol0, filename);
     }
-  } else {
+  }
+  else
+  {
     swap = 1;
-    if (n_range_viol1 > 0) {
-      printf("Warning: %i header field range violations detected in file %s \n", n_range_viol1,filename);
+    if (n_range_viol1 > 0)
+    {
+      printf("Warning: %i header field range violations detected in file %s \n",
+             n_range_viol1, filename);
     }
   }
   printf("\n+++++++++++++++++++++++++++++++++++++++++++\n");
   printf("Reading Information from MRC: %s \n", filename);
-  header_ok *= read_int(&nc,fin,swap);
-  header_ok *= read_int(&nr,fin,swap);
-  header_ok *= read_int(&ns,fin,swap);
-  header_ok *= read_int(&mode,fin,swap);
-  header_ok *= read_int(&ncstart,fin,swap);
-  header_ok *= read_int(&nrstart,fin,swap);
-  header_ok *= read_int(&nsstart,fin,swap);
-  header_ok *= read_int(&mx,fin,swap);
-  header_ok *= read_int(&my,fin,swap);
-  header_ok *= read_int(&mz,fin,swap);
-  header_ok *= read_float(&xlen,fin,swap);
-  header_ok *= read_float(&ylen,fin,swap);
-  header_ok *= read_float(&zlen,fin,swap);
-  header_ok *= read_float(&a_tmp,fin,swap);
-  header_ok *= read_float(&b_tmp,fin,swap);
-  header_ok *= read_float(&g_tmp,fin,swap);
-  header_ok *= read_int(&mapc,fin,swap);
-  header_ok *= read_int(&mapr,fin,swap);
-  header_ok *= read_int(&maps_local,fin,swap);
-  header_ok *= read_float(&dmin,fin,swap);
-  header_ok *= read_float(&dmax,fin,swap);
-  header_ok *= read_float(&dmean,fin,swap);
-  header_ok *= read_int(&ispg,fin,swap);
-  header_ok *= read_int(&nsymbt,fin,swap);
-  header_ok *= read_int(&lskflg,fin,swap);
-
-  printf("Number Columns  = %8d \n",nc);
-  printf("Number Rows     = %8d \n",nr);
-  printf("Number Sections = %8d \n",ns);
-  printf("MODE = %4d (only data type mode 2: 32-bit)\n",mode);
-  printf("NSYMBT = %4d (# bytes symmetry operators)\n",nsymbt);
-
-  /* printf("  NCSTART = %8d  (index of first column, counting from 0)\n",ncstart);
-     printf(">  NRSTART = %8d  (index of first row, counting from 0)\n",nrstart);
-     printf("  NSSTART = %8d  (index of first section, counting from 0)\n",nsstart);
+  header_ok *= read_int(&nc, fin, swap);
+  header_ok *= read_int(&nr, fin, swap);
+  header_ok *= read_int(&ns, fin, swap);
+  header_ok *= read_int(&mode, fin, swap);
+  header_ok *= read_int(&ncstart, fin, swap);
+  header_ok *= read_int(&nrstart, fin, swap);
+  header_ok *= read_int(&nsstart, fin, swap);
+  header_ok *= read_int(&mx, fin, swap);
+  header_ok *= read_int(&my, fin, swap);
+  header_ok *= read_int(&mz, fin, swap);
+  header_ok *= read_float(&xlen, fin, swap);
+  header_ok *= read_float(&ylen, fin, swap);
+  header_ok *= read_float(&zlen, fin, swap);
+  header_ok *= read_float(&a_tmp, fin, swap);
+  header_ok *= read_float(&b_tmp, fin, swap);
+  header_ok *= read_float(&g_tmp, fin, swap);
+  header_ok *= read_int(&mapc, fin, swap);
+  header_ok *= read_int(&mapr, fin, swap);
+  header_ok *= read_int(&maps_local, fin, swap);
+  header_ok *= read_float(&dmin, fin, swap);
+  header_ok *= read_float(&dmax, fin, swap);
+  header_ok *= read_float(&dmean, fin, swap);
+  header_ok *= read_int(&ispg, fin, swap);
+  header_ok *= read_int(&nsymbt, fin, swap);
+  header_ok *= read_int(&lskflg, fin, swap);
+
+  printf("Number Columns  = %8d \n", nc);
+  printf("Number Rows     = %8d \n", nr);
+  printf("Number Sections = %8d \n", ns);
+  printf("MODE = %4d (only data type mode 2: 32-bit)\n", mode);
+  printf("NSYMBT = %4d (# bytes symmetry operators)\n", nsymbt);
+
+  /* printf("  NCSTART = %8d  (index of first column, counting from
+     0)\n",ncstart);
+     printf(">  NRSTART = %8d  (index of first row, counting from
+     0)\n",nrstart);
+     printf("  NSSTART = %8d  (index of first section, counting from
+     0)\n",nsstart);
      printf("       MX = %8d  (# of X intervals in unit cell)\n",mx);
      printf("       MY = %8d  (# of Y intervals in unit cell)\n",my);
      printf("       MZ = %8d  (# of Z intervals in unit cell)\n",mz);
@@ -478,136 +515,162 @@ int bioem_RefMap::read_MRC(const char* filename,bioem_param& param)
      printf("    DMEAN = %8.3f  (mean density value - ignored)\n",dmean);
      printf("     ISPG = %8d  (space group number - ignored)\n",ispg);
      printf("   NSYMBT = %8d  (# bytes storing symmetry operators)\n",nsymbt);
-     printf("   LSKFLG = %8d  (skew matrix flag: 0:none, 1:follows)\n",lskflg);*/
+     printf("   LSKFLG = %8d  (skew matrix flag: 0:none,
+     1:follows)\n",lskflg);*/
 
-  if (header_ok == 0) {
+  if (header_ok == 0)
+  {
     cout << "ERROR reading MRC header: " << filename;
     exit(1);
   }
 
-  if(nr!=param.param_device.NumberPixels || nc!=param.param_device.NumberPixels )
-    {
-      cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE ( " << param.param_device.NumberPixels << ", i " << nc << ", j " << nr << ")" << "\n";
-      if(!param.notsqure)  exit(1);
-    }
+  if (nr != param.param_device.NumberPixels ||
+      nc != param.param_device.NumberPixels)
+  {
+    cout << "PROBLEM INCONSISTENT NUMBER OF PIXELS IN MAPS AND INPUTFILE ( "
+         << param.param_device.NumberPixels << ", i " << nc << ", j " << nr
+         << ")"
+         << "\n";
+    exit(1);
+  }
 
   if (ntotRefMap == 0)
-    {
-      maps = (myfloat_t*) mallocchk(refMapSize * sizeof(myfloat_t) * ns);
-    }
+  {
+    maps = (myfloat_t *) mallocchk(refMapSize * sizeof(myfloat_t) * ns);
+  }
   else
-    {
-      maps = (myfloat_t*) reallocchk(maps, refMapSize * sizeof(myfloat_t) * (ntotRefMap + ns));
-    }
+  {
+    maps = (myfloat_t *) reallocchk(maps, refMapSize * sizeof(myfloat_t) *
+                                              (ntotRefMap + ns));
+  }
 
-  if(mode!=2)
-    {
-      cout << "ERROR with MRC mode " << mode << "\n";
-      cout << "Currently mode 2 is the only one allowed" << "\n";
-      exit(1);
-    }
+  if (mode != 2)
+  {
+    cout << "ERROR with MRC mode " << mode << "\n";
+    cout << "Currently mode 2 is the only one allowed"
+         << "\n";
+    exit(1);
+  }
   else
+  {
+    rewind(fin);
+    for (count = 0; count < 256; ++count)
+      if (read_float_empty(fin) == 0)
+      {
+        cout << "ERROR Converting Data: " << filename;
+        exit(1);
+      }
+
+    for (count = 0; count < (unsigned long) nsymbt; ++count)
+      if (read_char_float(&currfloat, fin) == 0)
+      {
+        cout << "ERROR Converting Data: " << filename;
+        exit(1);
+      }
+
+    for (int nmap = 0; nmap < ns; nmap++)
     {
-      rewind (fin);
-      for (count=0; count<256; ++count) if (read_float_empty(fin)==0) {
-	  cout << "ERROR Converting Data: " <<  filename;
-	  exit(1);
-	}
-
-      for (count=0; count<(unsigned long)nsymbt; ++count) if (read_char_float(&currfloat,fin)==0) {
-	  cout << "ERROR Converting Data: " <<  filename;
-	  exit(1);
-	}
-
-      for ( int nmap = 0 ; nmap < ns ; nmap ++ )
-	{
-	  st=0.0;
-	  st2=0.0;
-	  for ( int j = 0 ; j < nr ; j ++ )
-	    for ( int i = 0 ; i < nc ; i ++ )
-	      {
-		if (read_float(&currfloat,fin,swap)==0)
-		  {
-		    cout << "ERROR Converting Data: " <<  filename;
-		    exit(1);
-		  }
-		else
-		  {
-		    if(!param.notsqure){
-		      maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] = (myfloat_t) currfloat;
-		      st += currfloat;
-		      st2 += currfloat*currfloat;
-		    } else {
-		      if( i > 595 && i < 675 && j > 1250 && j< 1330 && nmap >230 && nmap <310)cout << "map1: " << i << " "<< j << " " << nmap << " " << currfloat <<"\n";
-		    }
-		  }
-	      }
-	  if(param.notsqure)exit(1);
-	  //Normaling maps to zero mean and unit standard deviation
-	  if(!param.notnormmap){
-	    st /= float(nr*nc);
-	    st2 = sqrt(st2 / float(nr * nc) - st * st);
-	    for ( int j = 0 ; j < nr ; j ++ ) for ( int i = 0 ; i < nc ; i ++ ){
-		maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] = maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] / st2 - st/st2;
-		//cout <<"MAP:: " << i << " " << j << " " <<  maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j]  << "\n";
-	      }
-	  }
-	}
-      ntotRefMap += ns ;
-      //  cout << ntotRefMap << "\n";
+      st = 0.0;
+      st2 = 0.0;
+      for (int j = 0; j < nr; j++)
+        for (int i = 0; i < nc; i++)
+        {
+          if (read_float(&currfloat, fin, swap) == 0)
+          {
+            cout << "ERROR Converting Data: " << filename;
+            exit(1);
+          }
+          else
+          {
+            maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] =
+                (myfloat_t) currfloat;
+            st += currfloat;
+            st2 += currfloat * currfloat;
+          }
+        }
+      // Normalizing maps to zero mean and unit standard deviation
+      if (!param.notnormmap)
+      {
+        st /= float(nr * nc);
+        st2 = sqrt(st2 / float(nr * nc) - st * st);
+        for (int j = 0; j < nr; j++)
+          for (int i = 0; i < nc; i++)
+          {
+            maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] =
+                maps[(nmap + ntotRefMap) * refMapSize + i * numPixels + j] /
+                    st2 -
+                st / st2;
+            // cout <<"MAP:: " << i << " " << j << " " <<  maps[(nmap +
+            // ntotRefMap) * refMapSize + i * numPixels + j]  << "\n";
+          }
+      }
     }
-  fclose (fin);
+    ntotRefMap += ns;
+    //  cout << ntotRefMap << "\n";
+  }
+  fclose(fin);
 
-  return(0);
+  return (0);
 }
 
-int bioem_RefMap::read_float(float *currfloat, FILE *fin, int swap) {
+int bioem_RefMap::read_float(float *currfloat, FILE *fin, int swap)
+{
   unsigned char *cptr, tmp;
 
-  if (fread(currfloat,4,1,fin)!=1) return 0;
-  if (swap == 1) {
-    cptr = (unsigned char *)currfloat;
+  if (fread(currfloat, 4, 1, fin) != 1)
+    return 0;
+  if (swap == 1)
+  {
+    cptr = (unsigned char *) currfloat;
     tmp = cptr[0];
-    cptr[0]=cptr[3];
-    cptr[3]=tmp;
+    cptr[0] = cptr[3];
+    cptr[3] = tmp;
     tmp = cptr[1];
-    cptr[1]=cptr[2];
-    cptr[2]=tmp;
+    cptr[1] = cptr[2];
+    cptr[2] = tmp;
   }
   return 1;
 }
 
-int  bioem_RefMap::read_int(int *currlong, FILE *fin, int swap) {
+int bioem_RefMap::read_int(int *currlong, FILE *fin, int swap)
+{
   unsigned char *cptr, tmp;
 
-  if (fread(currlong,4,1,fin)!=1) return 0;
-  if (swap == 1) {
-    cptr = (unsigned char *)currlong;
+  if (fread(currlong, 4, 1, fin) != 1)
+    return 0;
+  if (swap == 1)
+  {
+    cptr = (unsigned char *) currlong;
     tmp = cptr[0];
-    cptr[0]=cptr[3];
-    cptr[3]=tmp;
+    cptr[0] = cptr[3];
+    cptr[3] = tmp;
     tmp = cptr[1];
-    cptr[1]=cptr[2];
-    cptr[2]=tmp;
+    cptr[1] = cptr[2];
+    cptr[2] = tmp;
   }
   return 1;
 }
-int  bioem_RefMap::read_float_empty (FILE *fin) {
+int bioem_RefMap::read_float_empty(FILE *fin)
+{
   float currfloat;
 
-  if (fread(&currfloat,4,1,fin)!=1) return 0;
+  if (fread(&currfloat, 4, 1, fin) != 1)
+    return 0;
   return 1;
 }
 
-int  bioem_RefMap::read_char_float (float *currfloat, FILE *fin) {
+int bioem_RefMap::read_char_float(float *currfloat, FILE *fin)
+{
   char currchar;
 
-  if (fread(&currchar,1,1,fin)!=1) return 0;
-  *currfloat=(float)currchar;
+  if (fread(&currchar, 1, 1, fin) != 1)
+    return 0;
+  *currfloat = (float) currchar;
   return 1;
 }
 
-int  bioem_RefMap::test_mrc (const char *vol_file, int swap) {
+int bioem_RefMap::test_mrc(const char *vol_file, int swap)
+{
   FILE *fin;
   int nc, nr, ns, mx, my, mz;
   int mode, ncstart, nrstart, nsstart;
@@ -618,56 +681,71 @@ int  bioem_RefMap::test_mrc (const char *vol_file, int swap) {
   float dmin, dmax, dmean, dummy, xorigin, yorigin, zorigin;
 
   fin = fopen(vol_file, "rb");
-  if( fin == NULL ) {
+  if (fin == NULL)
+  {
     cout << "ERROR opening MRC: " << vol_file;
     exit(1);
   }
 
   //* read header info
-  header_ok *= read_int(&nc,fin,swap);
-  header_ok *= read_int(&nr,fin,swap);
-  header_ok *= read_int(&ns,fin,swap);
-  header_ok *= read_int(&mode,fin,swap);
-  header_ok *= read_int(&ncstart,fin,swap);
-  header_ok *= read_int(&nrstart,fin,swap);
-  header_ok *= read_int(&nsstart,fin,swap);
-  header_ok *= read_int(&mx,fin,swap);
-  header_ok *= read_int(&my,fin,swap);
-  header_ok *= read_int(&mz,fin,swap);
-  header_ok *= read_float(&xlen,fin,swap);
-  header_ok *= read_float(&ylen,fin,swap);
-  header_ok *= read_float(&zlen,fin,swap);
-  header_ok *= read_float(&alpha,fin,swap);
-  header_ok *= read_float(&beta,fin,swap);
-  header_ok *= read_float(&gamma,fin,swap);
-  header_ok *= read_int(&mapc,fin,swap);
-  header_ok *= read_int(&mapr,fin,swap);
-  header_ok *= read_int(&maps_local,fin,swap);
-  header_ok *= read_float(&dmin,fin,swap);
-  header_ok *= read_float(&dmax,fin,swap);
-  header_ok *= read_float(&dmean,fin,swap);
-  for (i=23; i<50; ++i) header_ok *= read_float(&dummy,fin,swap);
-  header_ok *= read_float(&xorigin,fin,swap);
-  header_ok *= read_float(&yorigin,fin,swap);
-  header_ok *= read_float(&zorigin,fin,swap);
-  fclose (fin);
-  if (header_ok == 0) {
+  header_ok *= read_int(&nc, fin, swap);
+  header_ok *= read_int(&nr, fin, swap);
+  header_ok *= read_int(&ns, fin, swap);
+  header_ok *= read_int(&mode, fin, swap);
+  header_ok *= read_int(&ncstart, fin, swap);
+  header_ok *= read_int(&nrstart, fin, swap);
+  header_ok *= read_int(&nsstart, fin, swap);
+  header_ok *= read_int(&mx, fin, swap);
+  header_ok *= read_int(&my, fin, swap);
+  header_ok *= read_int(&mz, fin, swap);
+  header_ok *= read_float(&xlen, fin, swap);
+  header_ok *= read_float(&ylen, fin, swap);
+  header_ok *= read_float(&zlen, fin, swap);
+  header_ok *= read_float(&alpha, fin, swap);
+  header_ok *= read_float(&beta, fin, swap);
+  header_ok *= read_float(&gamma, fin, swap);
+  header_ok *= read_int(&mapc, fin, swap);
+  header_ok *= read_int(&mapr, fin, swap);
+  header_ok *= read_int(&maps_local, fin, swap);
+  header_ok *= read_float(&dmin, fin, swap);
+  header_ok *= read_float(&dmax, fin, swap);
+  header_ok *= read_float(&dmean, fin, swap);
+  for (i = 23; i < 50; ++i)
+    header_ok *= read_float(&dummy, fin, swap);
+  header_ok *= read_float(&xorigin, fin, swap);
+  header_ok *= read_float(&yorigin, fin, swap);
+  header_ok *= read_float(&zorigin, fin, swap);
+  fclose(fin);
+  if (header_ok == 0)
+  {
     cout << "ERROR reading MRC header: " << vol_file;
     exit(1);
   }
 
-  n_range_viols += (nc>5000); n_range_viols += (nc<0);
-  n_range_viols += (nr>5000); n_range_viols += (nr<0);
-  n_range_viols += (ns>5000); n_range_viols += (ns<0);
-  n_range_viols += (ncstart>5000); n_range_viols += (ncstart<-5000);
-  n_range_viols += (nrstart>5000); n_range_viols += (nrstart<-5000);
-  n_range_viols += (nsstart>5000); n_range_viols += (nsstart<-5000);
-  n_range_viols += (mx>5000); n_range_viols += (mx<0);
-  n_range_viols += (my>5000); n_range_viols += (my<0);
-  n_range_viols += (mz>5000); n_range_viols += (mz<0);
-  n_range_viols += (alpha>360.0f); n_range_viols += (alpha<-360.0f);
-  n_range_viols += (beta>360.0f); n_range_viols += (beta<-360.0f);
-  n_range_viols += (gamma>360.0f); n_range_viols += (gamma<-360.0f);
+  n_range_viols += (nc > 5000);
+  n_range_viols += (nc < 0);
+  n_range_viols += (nr > 5000);
+  n_range_viols += (nr < 0);
+  n_range_viols += (ns > 5000);
+  n_range_viols += (ns < 0);
+  n_range_viols += (ncstart > 5000);
+  n_range_viols += (ncstart < -5000);
+  n_range_viols += (nrstart > 5000);
+  n_range_viols += (nrstart < -5000);
+  n_range_viols += (nsstart > 5000);
+  n_range_viols += (nsstart < -5000);
+  n_range_viols += (mx > 5000);
+  n_range_viols += (mx < 0);
+  n_range_viols += (my > 5000);
+  n_range_viols += (my < 0);
+  n_range_viols += (mz > 5000);
+  n_range_viols += (mz < 0);
+  n_range_viols += (alpha > 360.0f);
+  n_range_viols += (alpha < -360.0f);
+  n_range_viols += (beta > 360.0f);
+  n_range_viols += (beta < -360.0f);
+  n_range_viols += (gamma > 360.0f);
+  n_range_viols += (gamma < -360.0f);
 
   return n_range_viols;
 }
diff --git a/model.cpp b/model.cpp
index 2e16b68e9c9369ccd3c79510467c1bfdd5b1d645..a0a42163b26800a2742a413dc68d8fbedf121250 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1,317 +1,389 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
+   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
         Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cstring>
 
 #include "model.h"
 #include "param.h"
 
 using namespace std;
 
-bioem_model::bioem_model()
-{
-  points = NULL;
-}
+bioem_model::bioem_model() { points = NULL; }
 
 bioem_model::~bioem_model()
 {
-  if (points) free(points);
+  if (points)
+    free(points);
 }
 
-int bioem_model::readModel(bioem_param& param, const char* filemodel)
+int bioem_model::readModel(bioem_param &param, const char *filemodel)
 {
   // **************************************************************************************
-  // ***************Reading reference Models either PDB or x,y,z,r,d format****************
+  // ***************Reading reference Models either PDB or x,y,z,r,d
+  // format****************
   // **************************************************************************************
 
   ofstream exampleReadCoor;
-  exampleReadCoor.open ("COORDREAD");
+  exampleReadCoor.open("COORDREAD");
 
-  exampleReadCoor << "Text --- Number ---- x ---- y ---- z ---- radius ---- number of electron\n";
-	
+  exampleReadCoor << "Text --- Number ---- x ---- y ---- z ---- radius ---- "
+                     "number of electron\n";
   int allocsize = 0;
 
   std::ifstream input(filemodel);
-  if(readPDB)
+  if (readPDB)
+  {
+    //************** Reading PDB files **********************
+
+    ifstream input(filemodel);
+    if (!input.good())
     {
-      //************** Reading PDB files **********************
+      cout << "PDB Failed to open file"
+           << endl; // pdbfilename << " ("<<filename<<")\n";
+      exit(1);
+    }
 
-      ifstream input(filemodel);
-      if (!input.good())
-	{
-	  cout << "PDB Failed to open file" << endl ; // pdbfilename << " ("<<filename<<")\n";
-	  exit(1);
-	}
+    char line[512] = {0};
+    char tmpLine[512] = {0};
+    int numres = 0;
+    NormDen = 0.0;
 
-      char line[512] = {0};
-      char tmpLine[512] = {0};
-      int numres = 0;
-      NormDen = 0.0;
+    string strfilename(filemodel);
 
-      string strfilename(filemodel);
+    size_t foundpos = strfilename.find(".pdb");
+    size_t endpos = strfilename.find_last_not_of(" \t");
 
-      size_t foundpos= strfilename.find(".pdb");
-      size_t endpos = strfilename.find_last_not_of(" \t");
+    if (foundpos > endpos)
+    {
+      cout << "Warining:::: .pdb extension NOT dectected in file name \n";
+      cout << "Warining::::  Are you sure you want to read a PDB? \n";
+    }
 
-      if(foundpos > endpos){
-	cout << "Warining:::: .pdb extension NOT dectected in file name \n";
-	cout << "Warining::::  Are you sure you want to read a PDB? \n";
-      }
+    //  cout << " HERE	" << filemodel ;
+    // for each line in the file
+    while (!input.eof())
+    {
+      input.getline(line, 511);
 
+      strncpy(tmpLine, line, strlen(line));
+      char *token = strtok(tmpLine, " ");
 
-      //  cout << " HERE	" << filemodel ;
-      // for eachline in the file
-      while (!input.eof())
-	{
-	  input.getline(line, 511);
-
-	  strncpy(tmpLine, line, strlen(line));
-	  char *token = strtok(tmpLine, " ");
-
-	  if (strcmp(token, "ATOM") == 0) // Optional,Mandatory if standard residues exist
-	    {
-	      /*
-		1-6 			"ATOM  "
-		7 - 11		 Integer		 serial		Atom serial number.
-		13 - 16		Atom			name		  Atom name.
-		17			 Character	   altLoc		Alternate location indicator.
-		18 - 20		Residue name	resName	   Residue name.
-		22			 Character	   chainID	   Chain identifier.
-		23 - 26		Integer		 resSeq		Residue sequence number.
-		27			 AChar		   iCode		 Code for insertion of residues.
-		31 - 38		Real(8.3)	   x			 Orthogonal coordinates for X in
-		39 - 46		Real(8.3)	   y			 Orthogonal coordinates for Y in
-		47 - 54		Real(8.3)	   z			 Orthogonal coordinates for Z in
-	      */
-
-	      char name[5] = {0};
-	      char resName[4] = {0};
-	      float x = 0.0;
-	      float y = 0.0;
-	      float z = 0.0;
-	      char tmp[6] = {0};
-
-	      // parse name
-	      strncpy(tmp, line + 12, 4);
-	      sscanf (tmp, "%s", name);
-
-	      // parse resName
-	      strncpy(tmp, line + 17, 3);
-	      sscanf (tmp, "%s", resName);
-
-	      // parse x, y, z
-	      char tmpVals[36]  = {0};
-
-	      strncpy (tmpVals, line + 30, 8);
-	      sscanf (tmpVals, "%f", &x);
-
-	      strncpy (tmpVals, line + 38, 8);
-	      sscanf (tmpVals, "%f", &y);
-
-	      strncpy (tmpVals, line + 46, 8);
-	      sscanf (tmpVals, "%f", &z);
-
-	      if (strcmp(name, "CA") == 0)
-		{
-		  if (allocsize == 0)
-		    {
-		      allocsize = 64;
-		      points = (bioem_model_point*) mallocchk(sizeof(bioem_model_point) * allocsize);
-		    }
-		  else if (numres + 1 >= allocsize)
-		    {
-		      allocsize *= 2;
-		      points = (bioem_model_point*) reallocchk(points, sizeof(bioem_model_point) * allocsize);
-		    }
-					
-		  //Getting residue Radius and electron density
-		  points[numres].radius = getAminoAcidRad(resName);
-		  points[numres].density = getAminoAcidDensity(resName);
-		  NormDen += points[numres].density;
-
-		  //Getting the coordinates
-		  points[numres].point.pos[0] = (myfloat_t) x;
-		  points[numres].point.pos[1] = (myfloat_t) y;
-		  points[numres].point.pos[2] = (myfloat_t) z;
-		  exampleReadCoor << "RESIDUE " << numres << " " << points[numres].point.pos[0] << " " <<  points[numres].point.pos[1] << " " << points[numres].point.pos[2] << " " << points[numres].radius  << " " << points[numres].density << "\n";
-		  numres++;
-		}
-	    }
-
-
-	}
-      nPointsModel = numres;
-      cout << "Protein structure read from PDB\n";
+      if (strcmp(token, "ATOM") ==
+          0) // Optional,Mandatory if standard residues exist
+      {
+        /*
+          1-6 			"ATOM  "
+          7 - 11		 Integer		 serial		Atom
+          serial
+          number.
+          13 - 16		Atom			name		  Atom
+          name.
+          17			 Character	   altLoc Alternate
+          location indicator.
+          18 - 20		Residue name	resName	   Residue name.
+          22			 Character	   chainID	   Chain
+          identifier.
+          23 - 26		Integer		 resSeq		Residue
+          sequence number.
+          27			 AChar		   iCode		 Code
+          for
+          insertion of residues.
+          31 - 38		Real(8.3)	   x Orthogonal
+          coordinates for X in
+          39 - 46		Real(8.3)	   y Orthogonal
+          coordinates for Y in
+          47 - 54		Real(8.3)	   z Orthogonal
+          coordinates for Z in
+        */
+
+        char name[5] = {0};
+        char resName[4] = {0};
+        float x = 0.0;
+        float y = 0.0;
+        float z = 0.0;
+        char tmp[6] = {0};
+
+        // parse name
+        strncpy(tmp, line + 12, 4);
+        sscanf(tmp, "%s", name);
+
+        // parse resName
+        strncpy(tmp, line + 17, 3);
+        sscanf(tmp, "%s", resName);
+
+        // parse x, y, z
+        char tmpVals[36] = {0};
+
+        strncpy(tmpVals, line + 30, 8);
+        sscanf(tmpVals, "%f", &x);
+
+        strncpy(tmpVals, line + 38, 8);
+        sscanf(tmpVals, "%f", &y);
+
+        strncpy(tmpVals, line + 46, 8);
+        sscanf(tmpVals, "%f", &z);
+
+        if (strcmp(name, "CA") == 0)
+        {
+          if (allocsize == 0)
+          {
+            allocsize = 64;
+            points = (bioem_model_point *) mallocchk(sizeof(bioem_model_point) *
+                                                     allocsize);
+          }
+          else if (numres + 1 >= allocsize)
+          {
+            allocsize *= 2;
+            points = (bioem_model_point *) reallocchk(
+                points, sizeof(bioem_model_point) * allocsize);
+          }
+
+          // Getting residue Radius and electron density
+          points[numres].radius = getAminoAcidRad(resName);
+          points[numres].density = getAminoAcidDensity(resName);
+          NormDen += points[numres].density;
+
+          // Getting the coordinates
+          points[numres].point.pos[0] = (myfloat_t) x;
+          points[numres].point.pos[1] = (myfloat_t) y;
+          points[numres].point.pos[2] = (myfloat_t) z;
+          exampleReadCoor << "RESIDUE " << numres << " "
+                          << points[numres].point.pos[0] << " "
+                          << points[numres].point.pos[1] << " "
+                          << points[numres].point.pos[2] << " "
+                          << points[numres].radius << " "
+                          << points[numres].density << "\n";
+          numres++;
+        }
+      }
     }
-  else //Reading model from FILE FORMAT x,y,z,rad,density
-    {
-      //**************** Reading Text FILES ***********************
+    nPointsModel = numres;
+    cout << "Protein structure read from PDB\n";
+  }
+  else // Reading model from FILE FORMAT x,y,z,rad,density
+  {
+    //**************** Reading Text FILES ***********************
 
-      char line[128];
-      int numres = 0;
-      NormDen = 0.0;
+    char line[128];
+    int numres = 0;
+    NormDen = 0.0;
 
-      string strfilename(filemodel);
+    string strfilename(filemodel);
 
-      size_t foundpos= strfilename.find(".pdb");
-      size_t endpos = strfilename.find_last_not_of(" \t");
+    size_t foundpos = strfilename.find(".pdb");
+    size_t endpos = strfilename.find_last_not_of(" \t");
+
+    if (foundpos < endpos)
+    {
+      cout << "Warining:::: .pdb dectected in file name whilst using text read "
+              "\n";
+      cout << "Warining::::  Are you sure you do not need --ReadPDB? \n";
+      cout << "If so then you must include the keyword IGNORE_PDB in "
+              "inputfile\n";
+      if (not param.ignorePDB)
+        exit(1);
+    }
 
-      if(foundpos < endpos){
-	cout << "Warining:::: .pdb dectected in file name whilst using text read \n";
-	cout << "Warining::::  Are you sure you do not need --ReadPDB? \n";
-	cout << "If so then you must include the keyword IGNORE_PDB in inputfile\n";
-	if(not param.ignorePDB)exit(1);
+    FILE *file = fopen(filemodel, "r");
+    if (file == NULL)
+    {
+      cout << "Error opening file " << filemodel << "\n";
+      exit(1);
+    }
+    while (fgets(line, sizeof line, file) != NULL)
+    {
+      if (allocsize == 0)
+      {
+        allocsize = 64;
+        points = (bioem_model_point *) mallocchk(sizeof(bioem_model_point) *
+                                                 allocsize);
+      }
+      else if (numres + 1 >= allocsize)
+      {
+        allocsize *= 2;
+        points = (bioem_model_point *) reallocchk(
+            points, sizeof(bioem_model_point) * allocsize);
       }
-		
-      FILE *file = fopen ( filemodel , "r" );
-      if (file == NULL)
-	{
-	  cout << "Error opening file " << filemodel << "\n";
-	  exit(1);
-	}
-      while ( fgets ( line, sizeof line, file ) != NULL )
-	{
-	  if (allocsize == 0)
-	    {
-	      allocsize = 64;
-	      points = (bioem_model_point*) mallocchk(sizeof(bioem_model_point) * allocsize);
-	    }
-	  else if (numres + 1 >= allocsize)
-	    {
-	      allocsize *= 2;
-	      points = (bioem_model_point*) reallocchk(points, sizeof(bioem_model_point) * allocsize);
-	    }
-					
-	  float tmpval[5];
-	  sscanf(line, "%f %f %f %f %f", &tmpval[0], &tmpval[1], &tmpval[2], &tmpval[3], &tmpval[4]);
-	  points[numres].point.pos[0] = (myfloat_t) tmpval[0];
-	  points[numres].point.pos[1] = (myfloat_t) tmpval[1];
-	  points[numres].point.pos[2] = (myfloat_t) tmpval[2];
-	  points[numres].radius = (myfloat_t) tmpval[3];
-	  points[numres].density = (myfloat_t) tmpval[4];
-
-	  exampleReadCoor << "RESIDUE " << numres << " " << points[numres].point.pos[0] << " " <<  points[numres].point.pos[1] << " " << points[numres].point.pos[2] << " " << points[numres].radius  << " " << points[numres].density << "\n";
-	  NormDen += points[numres].density;
-	  numres++;
-	}
-      fclose(file);
-      nPointsModel = numres;
-      cout << "Protein structure read from Standard File\n";
+
+      float tmpval[5];
+      sscanf(line, "%f %f %f %f %f", &tmpval[0], &tmpval[1], &tmpval[2],
+             &tmpval[3], &tmpval[4]);
+      points[numres].point.pos[0] = (myfloat_t) tmpval[0];
+      points[numres].point.pos[1] = (myfloat_t) tmpval[1];
+      points[numres].point.pos[2] = (myfloat_t) tmpval[2];
+      points[numres].radius = (myfloat_t) tmpval[3];
+      points[numres].density = (myfloat_t) tmpval[4];
+
+      exampleReadCoor << "RESIDUE " << numres << " "
+                      << points[numres].point.pos[0] << " "
+                      << points[numres].point.pos[1] << " "
+                      << points[numres].point.pos[2] << " "
+                      << points[numres].radius << " " << points[numres].density
+                      << "\n";
+      NormDen += points[numres].density;
+      numres++;
     }
-  points = (bioem_model_point*) reallocchk(points, sizeof(bioem_model_point) * nPointsModel);
-  cout << "Total Number of Voxels " << nPointsModel ;
-  cout << "\nTotal Number of Electrons "  << NormDen ; 
+    fclose(file);
+    nPointsModel = numres;
+    cout << "Protein structure read from Standard File\n";
+  }
+  points = (bioem_model_point *) reallocchk(points, sizeof(bioem_model_point) *
+                                                        nPointsModel);
+  cout << "Total Number of Voxels " << nPointsModel;
+  cout << "\nTotal Number of Electrons " << NormDen;
   cout << "\n+++++++++++++++++++++++++++++++++++++++++ \n";
-  exampleReadCoor.close();
 
+  exampleReadCoor.close();
   //******************** Moving to Model to its center of density mass:
   myfloat3_t r_cm;
 
-  if(not(param.nocentermass)){ //by default it is normally done
+  if (not(param.nocentermass))
+  { // by default it is normally done
 
-    for(int n = 0; n < 3; n++)r_cm.pos[n] = 0.0;
+    for (int n = 0; n < 3; n++)
+      r_cm.pos[n] = 0.0;
 
-    for(int n = 0; n < nPointsModel; n++)
-      {
-	r_cm.pos[0] += points[n].point.pos[0]*points[n].density;
-	r_cm.pos[1] += points[n].point.pos[1]*points[n].density;
-	r_cm.pos[2] += points[n].point.pos[2]*points[n].density;
-      }
-    r_cm.pos[0] = r_cm.pos[0] / NormDen ; 
-    r_cm.pos[1] = r_cm.pos[1] / NormDen ; 
+    for (int n = 0; n < nPointsModel; n++)
+    {
+      r_cm.pos[0] += points[n].point.pos[0] * points[n].density;
+      r_cm.pos[1] += points[n].point.pos[1] * points[n].density;
+      r_cm.pos[2] += points[n].point.pos[2] * points[n].density;
+    }
+    r_cm.pos[0] = r_cm.pos[0] / NormDen;
+    r_cm.pos[1] = r_cm.pos[1] / NormDen;
     r_cm.pos[2] = r_cm.pos[2] / NormDen;
 
-    for(int n = 0; n < nPointsModel; n++)
-      {
-	points[n].point.pos[0] -= r_cm.pos[0];
-	points[n].point.pos[1] -= r_cm.pos[1];
-	points[n].point.pos[2] -= r_cm.pos[2];
-
-      }
+    for (int n = 0; n < nPointsModel; n++)
+    {
+      points[n].point.pos[0] -= r_cm.pos[0];
+      points[n].point.pos[1] -= r_cm.pos[1];
+      points[n].point.pos[2] -= r_cm.pos[2];
+    }
   }
-  return(0);
+  return (0);
 }
 
 myfloat_t bioem_model::getAminoAcidRad(char *name)
 {
-  // *************** Function that gets the radius for each amino acid ****************
+  // *************** Function that gets the radius for each amino acid
+  // ****************
   myfloat_t iaa = 0;
 
-  if(std::strcmp(name, "CYS") == 0)iaa = 2.75;
-  else if(std::strcmp(name, "PHE") == 0)iaa = 3.2;
-  else if(std::strcmp(name, "LEU") == 0)iaa = 3.1;
-  else if(std::strcmp(name, "TRP") == 0)iaa = 3.4;
-  else if(std::strcmp(name, "VAL") == 0)iaa = 2.95;
-  else if(std::strcmp(name, "ILE") == 0)iaa = 3.1;
-  else if(std::strcmp(name, "MET") == 0)iaa = 3.1;
-  else if(std::strcmp(name, "HIS") == 0)iaa = 3.05;
-  else if(std::strcmp(name, "TYR") == 0)iaa = 3.25;
-  else if(std::strcmp(name, "ALA") == 0)iaa = 2.5;
-  else if(std::strcmp(name, "GLY") == 0)iaa = 2.25;
-  else if(std::strcmp(name, "PRO") == 0)iaa = 2.8;
-  else if(std::strcmp(name, "ASN") == 0)iaa = 2.85;
-  else if(std::strcmp(name, "THR") == 0)iaa = 2.8;
-  else if(std::strcmp(name, "SER") == 0)iaa = 2.6;
-  else if(std::strcmp(name, "ARG") == 0)iaa = 3.3;
-  else if(std::strcmp(name, "GLN") == 0)iaa = 3.0;
-  else if(std::strcmp(name, "ASP") == 0)iaa = 2.8;
-  else if(std::strcmp(name, "LYS") == 0)iaa = 3.2;
-  else if(std::strcmp(name, "GLU") == 0)iaa = 2.95;
-
-  if(iaa == 0)
-    {
-      cout << "PROBLEM WITH AMINO ACID " << name << endl;
-      exit(1);
-    }
+  if (std::strcmp(name, "CYS") == 0)
+    iaa = 2.75;
+  else if (std::strcmp(name, "PHE") == 0)
+    iaa = 3.2;
+  else if (std::strcmp(name, "LEU") == 0)
+    iaa = 3.1;
+  else if (std::strcmp(name, "TRP") == 0)
+    iaa = 3.4;
+  else if (std::strcmp(name, "VAL") == 0)
+    iaa = 2.95;
+  else if (std::strcmp(name, "ILE") == 0)
+    iaa = 3.1;
+  else if (std::strcmp(name, "MET") == 0)
+    iaa = 3.1;
+  else if (std::strcmp(name, "HIS") == 0)
+    iaa = 3.05;
+  else if (std::strcmp(name, "TYR") == 0)
+    iaa = 3.25;
+  else if (std::strcmp(name, "ALA") == 0)
+    iaa = 2.5;
+  else if (std::strcmp(name, "GLY") == 0)
+    iaa = 2.25;
+  else if (std::strcmp(name, "PRO") == 0)
+    iaa = 2.8;
+  else if (std::strcmp(name, "ASN") == 0)
+    iaa = 2.85;
+  else if (std::strcmp(name, "THR") == 0)
+    iaa = 2.8;
+  else if (std::strcmp(name, "SER") == 0)
+    iaa = 2.6;
+  else if (std::strcmp(name, "ARG") == 0)
+    iaa = 3.3;
+  else if (std::strcmp(name, "GLN") == 0)
+    iaa = 3.0;
+  else if (std::strcmp(name, "ASP") == 0)
+    iaa = 2.8;
+  else if (std::strcmp(name, "LYS") == 0)
+    iaa = 3.2;
+  else if (std::strcmp(name, "GLU") == 0)
+    iaa = 2.95;
+
+  if (iaa == 0)
+  {
+    cout << "PROBLEM WITH AMINO ACID " << name << endl;
+    exit(1);
+  }
   return iaa;
-
 }
 
 myfloat_t bioem_model::getAminoAcidDensity(char *name)
 {
-  // *************** Function that gets the number of electrons for each amino acid ****************
+  // *************** Function that gets the number of electrons for each amino
+  // acid ****************
   myfloat_t iaa = 0.0;
 
-  if(std::strcmp(name, "CYS") == 0)iaa = 64.0;
-  else if(std::strcmp(name, "PHE") == 0)iaa = 88.0;
-  else if(std::strcmp(name, "LEU") == 0)iaa = 72.0;
-  else if(std::strcmp(name, "TRP") == 0)iaa = 108.0;
-  else if(std::strcmp(name, "VAL") == 0)iaa = 64.0;
-  else if(std::strcmp(name, "ILE") == 0)iaa = 72.0;
-  else if(std::strcmp(name, "MET") == 0)iaa = 80.0;
-  else if(std::strcmp(name, "HIS") == 0)iaa = 82.0;
-  else if(std::strcmp(name, "TYR") == 0)iaa = 96.0;
-  else if(std::strcmp(name, "ALA") == 0)iaa = 48.0;
-  else if(std::strcmp(name, "GLY") == 0)iaa = 40.0;
-  else if(std::strcmp(name, "PRO") == 0)iaa = 62.0;
-  else if(std::strcmp(name, "ASN") == 0)iaa = 66.0;
-  else if(std::strcmp(name, "THR") == 0)iaa = 64.0;
-  else if(std::strcmp(name, "SER") == 0)iaa = 56.0;
-  else if(std::strcmp(name, "ARG") == 0)iaa = 93.0;
-  else if(std::strcmp(name, "GLN") == 0)iaa = 78.0;
-  else if(std::strcmp(name, "ASP") == 0)iaa = 59.0;
-  else if(std::strcmp(name, "LYS") == 0)iaa = 79.0;
-  else if(std::strcmp(name, "GLU") == 0)iaa = 53.0;
-
-  if(iaa == 0.0)
-    {
-      cout << "PROBLEM WITH AMINO ACID " << name << endl;
-      exit(1);
-    }
+  if (std::strcmp(name, "CYS") == 0)
+    iaa = 64.0;
+  else if (std::strcmp(name, "PHE") == 0)
+    iaa = 88.0;
+  else if (std::strcmp(name, "LEU") == 0)
+    iaa = 72.0;
+  else if (std::strcmp(name, "TRP") == 0)
+    iaa = 108.0;
+  else if (std::strcmp(name, "VAL") == 0)
+    iaa = 64.0;
+  else if (std::strcmp(name, "ILE") == 0)
+    iaa = 72.0;
+  else if (std::strcmp(name, "MET") == 0)
+    iaa = 80.0;
+  else if (std::strcmp(name, "HIS") == 0)
+    iaa = 82.0;
+  else if (std::strcmp(name, "TYR") == 0)
+    iaa = 96.0;
+  else if (std::strcmp(name, "ALA") == 0)
+    iaa = 48.0;
+  else if (std::strcmp(name, "GLY") == 0)
+    iaa = 40.0;
+  else if (std::strcmp(name, "PRO") == 0)
+    iaa = 62.0;
+  else if (std::strcmp(name, "ASN") == 0)
+    iaa = 66.0;
+  else if (std::strcmp(name, "THR") == 0)
+    iaa = 64.0;
+  else if (std::strcmp(name, "SER") == 0)
+    iaa = 56.0;
+  else if (std::strcmp(name, "ARG") == 0)
+    iaa = 93.0;
+  else if (std::strcmp(name, "GLN") == 0)
+    iaa = 78.0;
+  else if (std::strcmp(name, "ASP") == 0)
+    iaa = 59.0;
+  else if (std::strcmp(name, "LYS") == 0)
+    iaa = 79.0;
+  else if (std::strcmp(name, "GLU") == 0)
+    iaa = 53.0;
+
+  if (iaa == 0.0)
+  {
+    cout << "PROBLEM WITH AMINO ACID " << name << endl;
+    exit(1);
+  }
   return iaa;
 }
-
diff --git a/param.cpp b/param.cpp
index a6437bd6f3fb3b42d27caa604432709581cb576a..ec5d1515d8e2ee58b2ccfc2f57d6b6bf70926fe1 100644
--- a/param.cpp
+++ b/param.cpp
@@ -1,47 +1,48 @@
 /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    < BioEM software for Bayesian inference of Electron Microscopy images>
-   Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, 
-        Volker Lindenstruth and Gerhard Hummer.
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
    Max Planck Institute of Biophysics, Frankfurt, Germany.
-   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany.
-   Max Planck Computing and Data Facility, Garching, Germany. 
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
 
-   Released under the GNU Public License, v3. 
+   Released under the GNU Public License, v3.
    See license statement for terms of distribution.
 
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <fstream>
 #include <cstring>
-#include <math.h>
 #include <fftw3.h>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #ifdef WITH_OPENMP
 #include <omp.h>
 #endif
 
-#include "param.h"
 #include "map.h"
+#include "param.h"
 
 using namespace std;
 
 bioem_param::bioem_param()
 {
- 
+
   //**************** Initializing Variables and defaults ****************
 
-  //Number of Pixels
+  // Number of Pixels
   param_device.NumberPixels = 0;
   param_device.NumberFFTPixels1D = 0;
   // Euler angle grid spacing
   angleGridPointsAlpha = 0;
   angleGridPointsBeta = 0;
-  //Envelop function paramters
+  // Envelope function parameters
   numberGridPointsEnvelop = 0;
-  //Contrast transfer function paramters
+  // Contrast transfer function parameters
   numberGridPointsCTF_amp = 0;
   numberGridPointsCTF_phase = 0;
 
@@ -57,61 +58,60 @@ bioem_param::bioem_param()
   angprior = NULL;
 
   printModel = false;
+  BestmapCalcCC = false;
 }
 
-int bioem_param::readParameters(const char* fileinput)
-{	// **************************************************************************************
-	// ***************************** Reading Input Parameters ******************************
-	// **************************************************************************************
+int bioem_param::readParameters(const char *fileinput)
+{ // **************************************************************************************
+  // ***************************** Reading Input Parameters
+  // ******************************
+  // **************************************************************************************
 
-	// Control for Parameters
+  // Control for Parameters
   bool yesPixSi = false;
   bool yesNumPix = false;
   bool yesGPal = false;
   bool yesGPbe = false;
-  bool yesMDC = false ;
-  bool yesBFact=false; 
-  bool yesDefocus=false;
-  bool yesAMP=false;
-  bool yesPSFenv=false;
-  bool yesPSFpha=false;
-  bool yesquatgrid=false;
+  bool yesMDC = false;
+  bool yesBFact = false;
+  bool yesDefocus = false;
+  bool yesAMP = false;
+  bool yesPSFenv = false;
+  bool yesPSFpha = false;
+  bool yesquatgrid = false;
 
   //***************** Default VALUES
-  param_device.flipped=false;
-  param_device.debugterm=false;
-  param_device.writeCC=false;
-  param_device.tousepsf=false;
-  param_device.CCwithBayes=true;
-  writeCTF=false;
-  elecwavel=0.019866;
-  ignoreCCoff=false;
-  doquater=false;
-  nocentermass=false;
-  printrotmod=false;  
-  readquatlist=false;
-  doaaradius=true;
-  notnormmap=false;
-  usepsf=false;
-  yespriorAngles=false; 
-  ignorepointsout=false;
-  printrotmod=false;
-  ignorePDB=false;
-
-  NotUn_angles=0;
-  priorMod=1; //Default
-  shiftX=0;
-  shiftY=0;
-  param_device.sigmaPriorbctf=100.;
-  param_device.sigmaPriordefo=1.0;
-  param_device.Priordefcent=3.0;
+  param_device.tousepsf = false;
+  writeCTF = false;
+  elecwavel = 0.019866;
+  doquater = false;
+  nocentermass = false;
+  printrotmod = false;
+  readquatlist = false;
+  doaaradius = true;
+  notnormmap = false;
+  usepsf = false;
+  yespriorAngles = false;
+  ignorepointsout = false;
+  printrotmod = false;
+  ignorePDB = false;
+
+  NotUn_angles = 0;
+  priorMod = 1; // Default
+  shiftX = 0;
+  shiftY = 0;
+  param_device.sigmaPriorbctf = 100.;
+  param_device.sigmaPriordefo = 2.0;
+  param_device.Priordefcent = 3.0;
+  param_device.sigmaPrioramp = 0.5;
+  param_device.Priorampcent = 0.;
 
   ifstream input(fileinput);
   if (!input.good())
-    {
-      cout << "Failed to open file: " << fileinput << "\n";
-      exit(1);
-    }
+  {
+    cout << "Failed to open file: " << fileinput << "\n";
+    exit(1);
+  }
 
   char line[512] = {0};
   char saveline[512];
@@ -119,1016 +119,1343 @@ int bioem_param::readParameters(const char* fileinput)
   cout << "\n +++++++++++++++++++++++++++++++++++++++++ \n";
   cout << "\n   READING BioEM PARAMETERS             \n\n";
   cout << " +++++++++++++++++++++++++++++++++++++++++ \n";
-	
-  while (!input.eof())
-    {
-      input.getline(line, 512);
-      strcpy(saveline, line);
-      char *token = strtok(line, " ");
-
-      if (token == NULL || line[0] == '#' || strlen(token) == 0)
-	{
-	  // comment or blank line
-	}
-      else if (strcmp(token, "PIXEL_SIZE") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  pixelSize = atof(token);
-	  if (pixelSize < 0 ) { cout << "*** Error: Negative pixelSize "; exit(1);}
-	  cout << "Pixel Sixe " << pixelSize << "\n";
-	  yesPixSi= true;
-	}
-      else if (strcmp(token, "NOT_SQUARE_IMAGE") == 0)
-        {
-	  notsqure=true;
-        }
-      else if (strcmp(token, "NUMBER_PIXELS") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  param_device.NumberPixels = int(atoi(token));
-	  if (param_device.NumberPixels < 0 ) { cout << "*** Error: Negative Number of Pixels "; exit(1);}
-	  cout << "Number of Pixels " << param_device.NumberPixels << "\n";
-	  yesNumPix= true ;
-	}
-      else if (strcmp(token, "GRIDPOINTS_ALPHA") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  angleGridPointsAlpha = int(atoi(token));
-	  if (angleGridPointsAlpha < 0 ) { cout << "*** Error: Negative GRIDPOINTS_ALPHA "; exit(1);}
-	  cout << "Grid points alpha " << angleGridPointsAlpha << "\n";
-	  yesGPal= true;
-	}
-      else if (strcmp(token, "GRIDPOINTS_BETA") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  angleGridPointsBeta = int(atoi(token));
-	  if (angleGridPointsBeta < 0 ) { cout << "*** Error: Negative GRIDPOINTS_BETA "; exit(1);}
-	  cout << "Grid points in Cosine ( beta ) " << angleGridPointsBeta << "\n";
-	  yesGPbe= true;
-	}
-      else if (strcmp(token, "USE_QUATERNIONS") == 0)
-	//        else if (token=="USE_QUATERNIONS")
-        {
-	  cout << "Orientations with Quaternions. \n";
-	  doquater= true;
-        }
-      else if (strcmp(token, "GRIDPOINTS_QUATERNION") == 0)
-        {
-          if(!notuniformangles){
-	    token = strtok(NULL, " ");
-	    GridPointsQuatern = int(atoi(token));
-	    cout << "Gridpoints Quaternions " << GridPointsQuatern << "\n";
-	  }else{
-	    cout << "Inconsitent Input: Grid or List with Quaternions?\n"; 
-	    exit(1);}
-	  yesquatgrid=true;
-	  doquater= true;
-        }
-      //CTF PARAMETERS
-      else if (strcmp(token, "CTF_B_ENV") == 0)
-        {
-          token = strtok(NULL, " ");
-          startBfactor = atof(token);
-          if (startBfactor < 0 ) { cout << "*** Error: Negative START B Env "; exit(1);}
-	  token = strtok(NULL, " ");
-          endBfactor = atof(token);
-          if (endBfactor < 0 ) { cout << "*** Error: Negative END B Env "; exit(1);}
-	  token = strtok(NULL, " ");
-          numberGridPointsEnvelop = int(atoi(token));
-          if (numberGridPointsEnvelop < 0 ) { cout << "*** Error: Negative Number of Grid points BEnv "; exit(1);}
-          cout << "Grid CTF B-ENV: " << startBfactor << " " << endBfactor <<" " << numberGridPointsEnvelop<< "\n";
-	  if(startBfactor > endBfactor){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-          yesBFact = true;
-        }
-      else if (strcmp(token,"CTF_DEFOCUS")==0)
-        {
-          token = strtok(NULL, " ");
-          startDefocus = atof(token);
-          if (startDefocus < 0 ) { cout << "*** Error: Negative START Defocus "; exit(1);}
-          token = strtok(NULL, " ");
-          endDefocus = atof(token);
-          if (endDefocus < 0 ) { cout << "*** Error: Negative END Defocus "; exit(1);}
-          token = strtok(NULL, " ");
-          numberGridPointsCTF_phase = int(atoi(token));
-          if (numberGridPointsCTF_phase < 0 ) { cout << "*** Error: Negative Number of Grid points Defocus "; exit(1);}
-          cout << "Grid CTF Defocus: " << startDefocus << " " << endDefocus <<" " << numberGridPointsCTF_phase << "\n";
-          if(startDefocus > endDefocus){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-          if(endDefocus> 8.){cout << "Defocus beyond 8micro-m range is not allowed \n"; exit(1);}
-	  yesDefocus = true;
-	}
-      else if (strcmp(token,"CTF_AMPLITUDE")==0)
-	{
-	  token = strtok(NULL, " ");
-	  startGridCTF_amp = atof(token);
-	  if (startGridCTF_amp < 0 ) { cout << "*** Error: Negative START Amplitude "; exit(1);}
-	  token = strtok(NULL, " ");
-	  endGridCTF_amp = atof(token);
-	  if (endGridCTF_amp < 0 ) { cout << "*** Error: Negative END Amplitude"; exit(1);}
-	  token = strtok(NULL, " ");
-	  numberGridPointsCTF_amp=int(atoi(token));
-	  if (numberGridPointsCTF_amp < 0 ) { cout << "*** Error: Negative Number of grid points amplitude "; exit(1);}
-	  cout << "Grid Amplitude: " << startGridCTF_amp << " " << endGridCTF_amp <<" " << numberGridPointsCTF_amp << "\n";
-	  if(startGridCTF_amp > endGridCTF_amp){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-	  yesAMP = true;
-	}
-      else if (strcmp(token,"ELECTRON_WAVELENGTH")==0)
-	{
-	  token = strtok(NULL," ");                
-	  elecwavel=atof(token);
-	  if(elecwavel < 0.0150 ){
-	    cout << "Wrong electron wave length " << elecwavel << "\n";
-	    cout << "Has to be in Angstrom (A)\n";
-	    exit(1);}
-	  cout << "Electron wave length in (A) is: " << elecwavel << "\n";
-	}
-      //PSF PARAMETERS
-      else if (strcmp(token, "USE_PSF") == 0)
-	{
-	  usepsf=true;
-          param_device.tousepsf=true;
-	  cout << "IMPORTANT: Using Point Spread Function. Thus, all parameters are in Real Space. \n";
-	}
-      else if (strcmp(token,"PSF_AMPLITUDE")==0)
-	{
-	  token = strtok(NULL, " ");
-	  startGridCTF_amp = atof(token);
-	  if (startGridCTF_amp < 0 ) { cout << "*** Error: Negative START Amplitude "; exit(1);}
-	  token = strtok(NULL, " ");
-	  endGridCTF_amp = atof(token);
-	  if (endGridCTF_amp < 0 ) { cout << "*** Error: Negative END Amplitude"; exit(1);}
-	  token = strtok(NULL, " ");
-	  numberGridPointsCTF_amp= int(atoi(token));
-	  if (numberGridPointsCTF_amp < 0 ) { cout << "*** Error: Negative Number of grid points amplitude "; exit(1);}
-	  cout << "Grid Amplitude: " << startGridCTF_amp << " " << endGridCTF_amp <<" " << numberGridPointsCTF_amp << "\n";
-	  if(startGridCTF_amp > endGridCTF_amp){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-	  yesAMP = true;
-	}
-      else if (strcmp(token,"PSF_ENVELOPE")==0)
-	{
-	  token = strtok(NULL, " ");
-	  startGridEnvelop = atof(token);
-	  if (startGridEnvelop < 0 ) { cout << "*** Error: Negative START PSF Env. "; exit(1);}
-	  token = strtok(NULL, " ");
-	  endGridEnvelop = atof(token);
-	  if (endGridEnvelop < 0 ) { cout << "*** Error: Negative END  PSF Env. "; exit(1);}
-	  token = strtok(NULL, " ");
-	  numberGridPointsEnvelop=int(atoi(token));
-	  if (numberGridPointsEnvelop < 0 ) { cout << "*** Error: Negative Number of grid points  PSF Env. "; exit(1);}
-	  cout << "Grid PSF Envelope: " << startGridEnvelop << " " << endGridEnvelop <<" " << numberGridPointsEnvelop << "\n";
-	  if(startGridEnvelop > endGridEnvelop){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-	  yesPSFenv = true;
-	}
-      else if (strcmp(token,"PSF_PHASE")==0)
-	{
-	  token = strtok(NULL, " ");
-	  startGridCTF_phase = atof(token);
-	  if (startGridCTF_phase < 0 ) { cout << "*** Error: Negative START Amplitud "; exit(1);}
-	  token = strtok(NULL, " ");
-	  endGridCTF_phase = atof(token);
-	  if (endGridCTF_phase < 0 ) { cout << "*** Error: Negative END Amplitud"; exit(1);}
-	  token = strtok(NULL, " ");
-	  numberGridPointsCTF_phase= int(atoi(token));
-	  if (numberGridPointsCTF_phase< 0 ) { cout << "*** Error: Negative Number of grid points amplitud "; exit(1);}
-	  cout << "Grid PSF phase: " << startGridCTF_phase << " " << endGridCTF_phase <<" " << numberGridPointsCTF_phase << "\n";
-	  if(startGridCTF_phase > endGridCTF_phase){ cout << "Error: Grid ill defined END > START\n"; exit(1);};
-	  yesPSFpha = true;
-	}
-      else if (strcmp(token, "DISPLACE_CENTER") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  param_device.maxDisplaceCenter = int(atoi(token));
-	  if (param_device.maxDisplaceCenter < 0 ) { cout << "*** Error: Negative MAX_D_CENTER "; exit(1);}
-	  cout << "Maximum displacement Center " <<  param_device.maxDisplaceCenter << "\n";
-	  token = strtok(NULL, " ");
-          param_device.GridSpaceCenter = int(atoi(token));
-          if (param_device.GridSpaceCenter < 0 ) { cout << "*** Error: Negative PIXEL_GRID_CENTER "; exit(1);}
-          cout << "Grid space displacement center " <<   param_device.GridSpaceCenter << "\n";
-	  yesMDC = true;
-	}
-      else if (strcmp(token, "WRITE_PROB_ANGLES") == 0) //Key word if writing down each angle probabilities
-	{
-	  param_device.writeAngles = true;
-	  cout << "Writing Probabilies of each angle \n";
-	}
-      else if (strcmp(token, "WRITE_CROSSCOR") == 0)//Key word if writing down full micrograph cross correlation
-	{
-	  param_device.writeCC = true;
-	  param_device.CCdisplace=10;
-	  cout << "Writing CrossCorrelations every 10 pixels\n";
-	}
-      else if (strcmp(token, "#CROSSCOR_GRID_SPACE") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  param_device.CCdisplace=int(atoi(token));
-	  if (param_device.CCdisplace < 0 ) { cout << "*** Error: Negative CROSSCOR_DISPLACE "; exit(1);}
-	  cout << "Writing Cross Correlation Displacement " <<  param_device.CCdisplace << "  \n";
-	}
-      else if (strcmp(token, "CROSSCOR_NOTBAYESIAN") == 0)
-	{
-	  param_device.CCwithBayes=false;
-	  cout << "Not using Bayesian Analysis to write Cross Correlation  \n";
-	}
-      else if (strcmp(token, "FLIPPED") == 0) //Key word if images are flipped for cross-correlation
-	{
-	  param_device.flipped = true;
-	  cout << "Micrograph Flipped Intensities \n";
-	}
-      else if (strcmp(token, "IGNORE_CROSSCORR_OFFSET") == 0) //Key word if images are flipped for cross-correlation
-	{
-	  ignoreCCoff = true;
-	  cout << "Ignoring Cross-Correlation offset \n";
-	}
-      else if (strcmp(token, "IGNORE_PDB") == 0) //Ignore PDB extension
-        {
-          ignorePDB = true;
-          cout << "Ignoring PDB extension in model file \n";
-        }
-      else if (strcmp(token, "NO_PROJECT_RADIUS") == 0) //If projecting CA with amino-acid radius
-	{
-	  doaaradius = false;
-	  cout << "Not Projecting corresponding radius \n";
-	}
-      else if (strcmp(token, "DEBUG_INDI_PROB_TERM") == 0)//writing out each term of the probability
-	{
-	  param_device.debugterm = true;
-	  cout << "Debugging Individual Probability Terms \n";
-	}
-      else if (strcmp(token, "WRITE_CTF_PARAM") == 0)//Number of Euler angle tripplets in non uniform Euler angle sampling
-	{
-	  writeCTF=true;
-	  token = strtok(NULL," ");
-	  cout << "Writing CTF parameters from PSF parameters that maximize the posterior. \n";
-	}
-      else if (strcmp(token, "NO_CENTEROFMASS") == 0)//Number of Euler angle tripplets in non uniform Euler angle sampling
-	{
-	  nocentermass=true;
-	  cout << "BE CAREFUL CENTER OF MASS IS NOT REMOVED \n Calculated images might be out of range \n";
-	}
-      else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0)//Number of Euler angle tripplets in non uniform Euler angle sampling
-	{
-	  printrotmod=true;
-	  cout << "PRINTING out rotatted models (best for debugging)\n";
-	}
-      else if (strcmp(token, "NO_MAP_NORM") == 0)
-	{
-	  notnormmap=true;
-	  cout << "NOT NORMALIZING MAP\n" ;
-	}
-      else if (strcmp(token, "PRIOR_MODEL") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  priorMod = atof(token);
-	  cout << "MODEL PRIOR Probability " << priorMod << "\n" ;
-	}
-      else if (strcmp(token, "PRIOR_ANGLES") == 0)
-	{
-	  yespriorAngles=true;
-	  cout << "READING Priors for Orientations in additonal orientation file\n" ;
-	}
-      else if (strcmp(token, "SHIFT_X") == 0)
-        {
-          token = strtok(NULL, " ");
-          shiftX=atoi(token);
-          cout << "Shifting initial model X by "<< shiftX << "\n" ;
-        }
-      else if (strcmp(token, "SHIFT_Y") == 0)
-        {
-          token = strtok(NULL, " ");
-          shiftY=atoi(token);
-          cout << "Shifting initial model Y by "<< shiftY << "\n" ;
-        }
-      else if (strcmp(token, "SIGMA_PRIOR_B_CTF") == 0)
-        {
-          token = strtok(NULL, " ");
-          param_device.sigmaPriorbctf=atof(token);
-          cout << "Chainging  Gaussian width in Prior of Envelope b parameter: " << param_device.sigmaPriorbctf << "\n";
-	}
-	  else if (strcmp(token, "SIGMA_PRIOR_DEFOCUS") == 0)
-        {
-          token = strtok(NULL, " ");
-          param_device.sigmaPriordefo=atof(token);
-          cout << "Gaussian Width in Prior of defocus parameter: " << param_device.sigmaPriordefo << "\n";
-        }
-  else if (strcmp(token, "PRIOR_DEFOCUS_CENTER") == 0)
-        {
-          token = strtok(NULL, " ");
-          param_device.Priordefcent=atof(token);
-          cout << "Gaussian Center in Prior of defocus parameter: " << param_device.Priordefcent << "\n";
-        }
 
-      else if (strcmp(token, "IGNORE_POINTSOUT") == 0)
-        {
-	  ignorepointsout=true;
-          cout << "Ignoring model points outside the map\n" ;
-        }
-     else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0)//Number of Euler angle tripplets in non uniform Euler angle sampling
-        {
-          printrotmod=true;
-          cout << "PRINTING out rotatted models (best for debugging)\n";
-        }
+  while (!input.eof())
+  {
+    input.getline(line, 512);
+    strcpy(saveline, line);
+    char *token = strtok(line, " ");
 
+    if (token == NULL || line[0] == '#' || strlen(token) == 0)
+    {
+      // comment or blank line
+    }
+    else if (strcmp(token, "PIXEL_SIZE") == 0)
+    {
+      token = strtok(NULL, " ");
+      pixelSize = atof(token);
+      if (pixelSize < 0)
+      {
+        cout << "*** Error: Negative pixelSize ";
+        exit(1);
+      }
+      cout << "Pixel Sixe " << pixelSize << "\n";
+      yesPixSi = true;
+    }
+    else if (strcmp(token, "NUMBER_PIXELS") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.NumberPixels = int(atoi(token));
+      if (param_device.NumberPixels < 0)
+      {
+        cout << "*** Error: Negative Number of Pixels ";
+        exit(1);
+      }
+      cout << "Number of Pixels " << param_device.NumberPixels << "\n";
+      yesNumPix = true;
+    }
+    else if (strcmp(token, "GRIDPOINTS_ALPHA") == 0)
+    {
+      token = strtok(NULL, " ");
+      angleGridPointsAlpha = int(atoi(token));
+      if (angleGridPointsAlpha < 0)
+      {
+        cout << "*** Error: Negative GRIDPOINTS_ALPHA ";
+        exit(1);
+      }
+      cout << "Grid points alpha " << angleGridPointsAlpha << "\n";
+      yesGPal = true;
+    }
+    else if (strcmp(token, "GRIDPOINTS_BETA") == 0)
+    {
+      token = strtok(NULL, " ");
+      angleGridPointsBeta = int(atoi(token));
+      if (angleGridPointsBeta < 0)
+      {
+        cout << "*** Error: Negative GRIDPOINTS_BETA ";
+        exit(1);
+      }
+      cout << "Grid points in Cosine ( beta ) " << angleGridPointsBeta << "\n";
+      yesGPbe = true;
+    }
+    else if (strcmp(token, "USE_QUATERNIONS") == 0)
+    //        else if (token=="USE_QUATERNIONS")
+    {
+      cout << "Orientations with Quaternions. \n";
+      doquater = true;
+    }
+    else if (strcmp(token, "GRIDPOINTS_QUATERNION") == 0)
+    {
+      if (!notuniformangles)
+      {
+        token = strtok(NULL, " ");
+        GridPointsQuatern = int(atoi(token));
+        cout << "Gridpoints Quaternions " << GridPointsQuatern << "\n";
+      }
+      else
+      {
+        cout << "Inconsitent Input: Grid or List with Quaternions?\n";
+        exit(1);
+      }
+      yesquatgrid = true;
+      doquater = true;
+    }
+    // CTF PARAMETERS
+    else if (strcmp(token, "CTF_B_ENV") == 0)
+    {
+      token = strtok(NULL, " ");
+      startBfactor = atof(token);
+      if (startBfactor < 0)
+      {
+        cout << "*** Error: Negative START B Env ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endBfactor = atof(token);
+      if (endBfactor < 0)
+      {
+        cout << "*** Error: Negative END B Env ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsEnvelop = int(atoi(token));
+      if (numberGridPointsEnvelop < 0)
+      {
+        cout << "*** Error: Negative Number of Grid points BEnv ";
+        exit(1);
+      }
+      cout << "Grid CTF B-ENV: " << startBfactor << " " << endBfactor << " "
+           << numberGridPointsEnvelop << "\n";
+      if (startBfactor > endBfactor)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      yesBFact = true;
+    }
+    else if (strcmp(token, "CTF_DEFOCUS") == 0)
+    {
+      token = strtok(NULL, " ");
+      startDefocus = atof(token);
+      if (startDefocus < 0)
+      {
+        cout << "*** Error: Negative START Defocus ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endDefocus = atof(token);
+      if (endDefocus < 0)
+      {
+        cout << "*** Error: Negative END Defocus ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsCTF_phase = int(atoi(token));
+      if (numberGridPointsCTF_phase < 0)
+      {
+        cout << "*** Error: Negative Number of Grid points Defocus ";
+        exit(1);
+      }
+      cout << "Grid CTF Defocus: " << startDefocus << " " << endDefocus << " "
+           << numberGridPointsCTF_phase << "\n";
+      if (startDefocus > endDefocus)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      if (endDefocus > 8.)
+      {
+        cout << "Defocus beyond 8micro-m range is not allowed \n";
+        exit(1);
+      }
+      yesDefocus = true;
+    }
+    else if (strcmp(token, "CTF_AMPLITUDE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_amp = atof(token);
+      if (startGridCTF_amp < 0)
+      {
+        cout << "*** Error: Negative START Amplitude ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endGridCTF_amp = atof(token);
+      if (endGridCTF_amp < 0)
+      {
+        cout << "*** Error: Negative END Amplitude";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsCTF_amp = int(atoi(token));
+      if (numberGridPointsCTF_amp < 0)
+      {
+        cout << "*** Error: Negative Number of grid points amplitude ";
+        exit(1);
+      }
+      cout << "Grid Amplitude: " << startGridCTF_amp << " " << endGridCTF_amp
+           << " " << numberGridPointsCTF_amp << "\n";
+      if (startGridCTF_amp > endGridCTF_amp)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      yesAMP = true;
+    }
+    else if (strcmp(token, "ELECTRON_WAVELENGTH") == 0)
+    {
+      token = strtok(NULL, " ");
+      elecwavel = atof(token);
+      if (elecwavel < 0.0150)
+      {
+        cout << "Wrong electron wave length " << elecwavel << "\n";
+        cout << "Has to be in Angstrom (A)\n";
+        exit(1);
+      }
+      cout << "Electron wave length in (A) is: " << elecwavel << "\n";
+    }
+    // PSF PARAMETERS
+    else if (strcmp(token, "USE_PSF") == 0)
+    {
+      usepsf = true;
+      param_device.tousepsf = true;
+      cout << "IMPORTANT: Using Point Spread Function. Thus, all parameters "
+              "are in Real Space. \n";
+    }
+    else if (strcmp(token, "PSF_AMPLITUDE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_amp = atof(token);
+      if (startGridCTF_amp < 0)
+      {
+        cout << "*** Error: Negative START Amplitude ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endGridCTF_amp = atof(token);
+      if (endGridCTF_amp < 0)
+      {
+        cout << "*** Error: Negative END Amplitude";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsCTF_amp = int(atoi(token));
+      if (numberGridPointsCTF_amp < 0)
+      {
+        cout << "*** Error: Negative Number of grid points amplitude ";
+        exit(1);
+      }
+      cout << "Grid Amplitude: " << startGridCTF_amp << " " << endGridCTF_amp
+           << " " << numberGridPointsCTF_amp << "\n";
+      if (startGridCTF_amp > endGridCTF_amp)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      yesAMP = true;
+    }
+    else if (strcmp(token, "PSF_ENVELOPE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridEnvelop = atof(token);
+      if (startGridEnvelop < 0)
+      {
+        cout << "*** Error: Negative START PSF Env. ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endGridEnvelop = atof(token);
+      if (endGridEnvelop < 0)
+      {
+        cout << "*** Error: Negative END  PSF Env. ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsEnvelop = int(atoi(token));
+      if (numberGridPointsEnvelop < 0)
+      {
+        cout << "*** Error: Negative Number of grid points  PSF Env. ";
+        exit(1);
+      }
+      cout << "Grid PSF Envelope: " << startGridEnvelop << " " << endGridEnvelop
+           << " " << numberGridPointsEnvelop << "\n";
+      if (startGridEnvelop > endGridEnvelop)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      yesPSFenv = true;
+    }
+    else if (strcmp(token, "PSF_PHASE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_phase = atof(token);
+      if (startGridCTF_phase < 0)
+      {
+        cout << "*** Error: Negative START Amplitud ";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      endGridCTF_phase = atof(token);
+      if (endGridCTF_phase < 0)
+      {
+        cout << "*** Error: Negative END Amplitud";
+        exit(1);
+      }
+      token = strtok(NULL, " ");
+      numberGridPointsCTF_phase = int(atoi(token));
+      if (numberGridPointsCTF_phase < 0)
+      {
+        cout << "*** Error: Negative Number of grid points amplitud ";
+        exit(1);
+      }
+      cout << "Grid PSF phase: " << startGridCTF_phase << " "
+           << endGridCTF_phase << " " << numberGridPointsCTF_phase << "\n";
+      if (startGridCTF_phase > endGridCTF_phase)
+      {
+        cout << "Error: Grid ill defined END > START\n";
+        exit(1);
+      };
+      yesPSFpha = true;
+    }
+    else if (strcmp(token, "DISPLACE_CENTER") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.maxDisplaceCenter = int(atoi(token));
+      if (param_device.maxDisplaceCenter < 0)
+      {
+        cout << "*** Error: Negative MAX_D_CENTER ";
+        exit(1);
+      }
+      cout << "Maximum displacement Center " << param_device.maxDisplaceCenter
+           << "\n";
+      token = strtok(NULL, " ");
+      param_device.GridSpaceCenter = int(atoi(token));
+      if (param_device.GridSpaceCenter < 0)
+      {
+        cout << "*** Error: Negative PIXEL_GRID_CENTER ";
+        exit(1);
+      }
+      cout << "Grid space displacement center " << param_device.GridSpaceCenter
+           << "\n";
+      yesMDC = true;
+    }
+    else if (strcmp(token, "WRITE_PROB_ANGLES") ==
+             0) // Key word if writing down each angle probabilities
+    {
+      token = strtok(NULL, " ");
+      param_device.writeAngles = int(atoi(token));
+      if (param_device.writeAngles < 0)
+      {
+        cout << "*** Error: Negative WRITE_PROB_ANGLES ";
+        exit(1);
+      }
+      cout << "Writing " << param_device.writeAngles
+           << " Probabilies of each angle \n";
+    }
+    else if (strcmp(token, "IGNORE_PDB") == 0) // Ignore PDB extension
+    {
+      ignorePDB = true;
+      cout << "Ignoring PDB extension in model file \n";
+    }
+    else if (strcmp(token, "NO_PROJECT_RADIUS") ==
+             0) // If projecting CA with amino-acid radius
+    {
+      doaaradius = false;
+      cout << "Not Projecting corresponding radius \n";
+    }
+    else if (strcmp(token, "WRITE_CTF_PARAM") == 0) // Enable writeCTF; one
+                                                    // further token is read
+                                                    // from the line and unused
+    {
+      writeCTF = true;
+      token = strtok(NULL, " ");
+      cout << "Writing CTF parameters from PSF parameters that maximize the "
+              "posterior. \n";
+    }
+    else if (strcmp(token, "NO_CENTEROFMASS") == 0) // Sets nocentermass and
+                                                    // prints a warning that
+                                                    // images may be off range
+    {
+      nocentermass = true;
+      cout << "BE CAREFUL CENTER OF MASS IS NOT REMOVED \n Calculated images "
+              "might be out of range \n";
+    }
+    else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0) // Sets printrotmod to
+                                                         // request printing of
+                                                         // the rotated models
+                                                         // (debugging aid)
+    {
+      printrotmod = true;
+      cout << "PRINTING out rotatted models (best for debugging)\n";
+    }
+    else if (strcmp(token, "NO_MAP_NORM") == 0)
+    {
+      notnormmap = true;
+      cout << "NOT NORMALIZING MAP\n";
+    }
+    else if (strcmp(token, "PRIOR_MODEL") == 0)
+    {
+      token = strtok(NULL, " ");
+      priorMod = atof(token);
+      cout << "MODEL PRIOR Probability " << priorMod << "\n";
+    }
+    else if (strcmp(token, "PRIOR_ANGLES") == 0)
+    {
+      yespriorAngles = true;
+      cout << "READING Priors for Orientations in additonal orientation file\n";
+    }
+    else if (strcmp(token, "SHIFT_X") == 0)
+    {
+      token = strtok(NULL, " ");
+      shiftX = atoi(token);
+      cout << "Shifting initial model X by " << shiftX << "\n";
+    }
+    else if (strcmp(token, "SHIFT_Y") == 0)
+    {
+      token = strtok(NULL, " ");
+      shiftY = atoi(token);
+      cout << "Shifting initial model Y by " << shiftY << "\n";
+    }
+    else if (strcmp(token, "SIGMA_PRIOR_B_CTF") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.sigmaPriorbctf = atof(token);
+      cout << "Chainging  Gaussian width in Prior of Envelope b parameter: "
+           << param_device.sigmaPriorbctf << "\n";
+    }
+    else if (strcmp(token, "SIGMA_PRIOR_DEFOCUS") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.sigmaPriordefo = atof(token);
+      cout << "Gaussian Width in Prior of defocus parameter: "
+           << param_device.sigmaPriordefo << "\n";
+    }
+    else if (strcmp(token, "PRIOR_DEFOCUS_CENTER") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.Priordefcent = atof(token);
+      cout << "Gaussian Center in Prior of defocus parameter: "
+           << param_device.Priordefcent << "\n";
+    }
+    else if (strcmp(token, "SIGMA_PRIOR_AMP_CTF") == 0)
+    { // Store the amplitude-prior width and echo the value just parsed
+      token = strtok(NULL, " ");
+      param_device.sigmaPrioramp = atof(token);
+      cout << "Gaussian Width in Prior of amplitude parameter: "
+           << param_device.sigmaPrioramp << "\n";
+    }
+    else if (strcmp(token, "PRIOR_AMP_CTF_CENTER") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.Priorampcent = atof(token);
+      cout << "Gaussian Center in Prior of defocus parameter: "
+           << param_device.Priordefcent << "\n";
+    }
 
+    else if (strcmp(token, "IGNORE_POINTSOUT") == 0)
+    {
+      ignorepointsout = true;
+      cout << "Ignoring model points outside the map\n";
+    }
+    else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0) // Number of Euler
+                                                         // angle triplets in
+                                                         // non uniform Euler
+                                                         // angle sampling
+    {
+      printrotmod = true;
+      cout << "PRINTING out rotatted models (best for debugging)\n";
     }
+  }
   input.close();
 
   //************** Checks/Controlls for INPUT
 
-  if( not ( yesPixSi ) ){ cout << "**** INPUT MISSING: Please provide PIXEL_SIZE\n" ; exit (1);};
-  if( not ( yesNumPix ) ){ cout << "**** INPUT MISSING: Please provide NUMBER_PIXELS \n" ; exit (1);};
-  if(!notuniformangles){
-    if(!doquater){
-      if( not ( yesGPal) ) { cout << "**** INPUT MISSING: Please provide GRIDPOINTS_ALPHA \n" ; exit (1);};
-      if( not ( yesGPbe )) { cout << "**** INPUT MISSING: Please provide GRIDPOINTS_BETA \n" ; exit (1);};
-    }else if (!yesquatgrid){
-      cout << "**** INPUT MISSING: Please provide GRIDPOINTS_QUATERNION \n" ; exit (1);
+  if (not(yesPixSi))
+  {
+    cout << "**** INPUT MISSING: Please provide PIXEL_SIZE\n";
+    exit(1);
+  };
+  if (not(yesNumPix))
+  {
+
+    cout << "**** INPUT MISSING: Please provide NUMBER_PIXELS \n";
+    exit(1);
+  };
+  if (!notuniformangles)
+  {
+    if (!doquater)
+    {
+      if (not(yesGPal))
+      {
+        cout << "**** INPUT MISSING: Please provide GRIDPOINTS_ALPHA \n";
+        exit(1);
+      };
+      if (not(yesGPbe))
+      {
+        cout << "**** INPUT MISSING: Please provide GRIDPOINTS_BETA \n";
+        exit(1);
+      };
+    }
+    else if (!yesquatgrid)
+    {
+      cout << "**** INPUT MISSING: Please provide GRIDPOINTS_QUATERNION \n";
+      exit(1);
     }
   }
-  if( not (  yesMDC  ) ) { cout << "**** INPUT MISSING: Please provide GRID Displacement CENTER \n" ; exit (1);};
-  if( param_device.writeCC && param_device.CCdisplace < 1 ){ cout << "**** INPUT MISSING: Please provide CROSSCOR_DISPLACE \n" ; exit (1);};
-  if( param_device.writeCC) {if(!param_device.CCwithBayes ){  cout << "Remark:: Not Using Bayesian method to store Cross-Correlation.\n Only Printing out Maximum\n";}
-    if(param_device.flipped){ cout << "Remark:: Micrographs are Flipped = Particles are white\n";} else {  cout << "Remark:: Micrographs are NOT Flipped = Particles are dark\n";}
-    if(param_device.writeAngles){cout << "Calculate Cross-cor and write prob angles are mutualy exclusive options\n"; exit(1);}
-  }
-
+  if (not(yesMDC))
+  {
+    cout << "**** INPUT MISSING: Please provide GRID Displacement CENTER \n";
+    exit(1);
+  };
 
-  cout << "To verify input of Priors:\n" ;
+  cout << "To verify input of Priors:\n";
   cout << "Sigma Prior B-Env: " << param_device.sigmaPriorbctf << "\n";
   cout << "Sigma Prior Defocus: " << param_device.sigmaPriordefo << "\n";
-  cout << "Center Prior Defocus: " <<param_device.Priordefcent <<"\n";
+  cout << "Center Prior Defocus: " << param_device.Priordefcent << "\n";
 
   // PSF or CTF Checks and asigments
-  if(usepsf){
-    if( not ( yesPSFpha ) ){ cout << "**** INPUT MISSING: Please provide Grid PSF PHASE \n" ; exit (1);};
-    if( not ( yesPSFenv ) ){ cout << "**** INPUT MISSING: Please provide Grid PSF ENVELOPE \n" ; exit (1);};
-    if( not ( yesAMP ) ){ cout << "**** INPUT MISSING: Please provide Grid PSF AMPLITUD \n" ; exit (1);};
-  } else {
-    //cout << "**Note:: Calculation using CTF values (not PSF). If this is not correct then key word: USE_PSF missing in inputfile**\n";	
-    if( not ( yesBFact ) ){ cout << "**** INPUT MISSING: Please provide Grid CTF B-ENV \n" ; exit (1);};
-    if( not ( yesDefocus ) ){ cout << "**** INPUT MISSING: Please provide Grid CTF Defocus \n" ; exit (1);};
-    if( not ( yesAMP ) ){ cout << "**** INPUT MISSING: Please provide Grid CTF AMPLITUD \n" ; exit (1);};
-    // Asigning values of phase according to defocus    
-    startGridCTF_phase= startDefocus * M_PI * 2.f * 10000 * elecwavel ;
-    endGridCTF_phase= endDefocus * M_PI * 2.f * 10000 * elecwavel ;
-    //Asigning values of envelope according to b-envelope (not b-factor)
-    startGridEnvelop = startBfactor ;// 2.f;
-    endGridEnvelop = endBfactor ; // / 2.f;
-    param_device.Priordefcent *= M_PI * 2.f * 10000 * elecwavel ;
-    param_device.sigmaPriordefo *= M_PI * 2.f * 10000 * elecwavel ;
+  if (usepsf)
+  {
+    if (not(yesPSFpha))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid PSF PHASE \n";
+      exit(1);
+    };
+    if (not(yesPSFenv))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid PSF ENVELOPE \n";
+      exit(1);
+    };
+    if (not(yesAMP))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid PSF AMPLITUD \n";
+      exit(1);
+    };
+  }
+  else
+  {
+    // cout << "**Note:: Calculation using CTF values (not PSF). If this is not
+    // correct then key word: USE_PSF missing in inputfile**\n";
+    if (not(yesBFact))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid CTF B-ENV \n";
+      exit(1);
+    };
+    if (not(yesDefocus))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid CTF Defocus \n";
+      exit(1);
+    };
+    if (not(yesAMP))
+    {
+      cout << "**** INPUT MISSING: Please provide Grid CTF AMPLITUD \n";
+      exit(1);
+    };
+    // Assigning values of phase according to defocus
+    startGridCTF_phase = startDefocus * M_PI * 2.f * 10000 * elecwavel;
+    endGridCTF_phase = endDefocus * M_PI * 2.f * 10000 * elecwavel;
+    // Assigning values of envelope according to b-envelope (not b-factor)
+    startGridEnvelop = startBfactor; // 2.f;
+    endGridEnvelop = endBfactor;     // / 2.f;
+    param_device.Priordefcent *= M_PI * 2.f * 10000 * elecwavel;
+    param_device.sigmaPriordefo *= M_PI * 2.f * 10000 * elecwavel;
   }
 
-  if(elecwavel==0.019688)cout << "Using default electron wave length: 0.019688 (A) of 300kV microscope\n";
+  if (elecwavel == 0.019688)
+    cout << "Using default electron wave length: 0.019688 (A) of 300kV "
+            "microscope\n";
 
   param_device.NumberFFTPixels1D = param_device.NumberPixels / 2 + 1;
   FFTMapSize = param_device.NumberPixels * param_device.NumberFFTPixels1D;
 
-  if(writeCTF && !usepsf){
-    cout << "Writing CTF is only valid when integrating over the PSF\n"; exit(1);
+  nTotParallelMaps = CUDA_FFTS_AT_ONCE;
+
+  if (writeCTF && !usepsf)
+  {
+    cout << "Writing CTF is only valid when integrating over the PSF\n";
+    exit(1);
   }
 
   cout << " +++++++++++++++++++++++++++++++++++++++++ \n";
 
-  return(0);
+  return (0);
 }
 
-int bioem_param::forprintBest(const char* fileinput)
+int bioem_param::forprintBest(const char *fileinput)
 {
   // **************************************************************************************
-  // **********Alternative parameter routine for only printing out a map ******************
+  // **********Alternative parameter routine for only printing out a map
+  // ******************
 
   ifstream input(fileinput);
-  withnoise=false;
-  showrotatemod=false;
-
-  param_device.flipped=false;
-  param_device.debugterm=false;
-  param_device.writeCC=false;
-  param_device.CCwithBayes=true;
-  writeCTF=false;
-  elecwavel=0.019866;
-  ignoreCCoff=false;
-  doquater=false;
-  nocentermass=false;
-  printrotmod=false;
-  readquatlist=false;
-  doaaradius=true;
-  shiftX=0;
-  shiftY=0;
-
+  withnoise = false;
+  showrotatemod = false;
+
+  writeCTF = false;
+  elecwavel = 0.019866;
+  doquater = false;
+  nocentermass = false;
+  printrotmod = false;
+  readquatlist = false;
+  doaaradius = true;
+  shiftX = 0;
+  shiftY = 0;
+  stnoise = 1;
 
   //**** Different keywords! For printing MAP ************
   if (!input.good())
-    {
-      cout << "Failed to open Best Parameter file: " << fileinput << "\n";
-      exit(1);
-    }
+  {
+    cout << "Failed to open Best Parameter file: " << fileinput << "\n";
+    exit(1);
+  }
 
   delete[] angles;
-  angles = new myfloat3_t[ 1 ] ; //Only best orientation
+  angles = new myfloat3_t[1]; // Only best orientation
 
   char line[512] = {0};
   char saveline[512];
-  bool ctfparam=false;
+  bool ctfparam = false;
 
-  usepsf=false;
+  usepsf = false;
 
   cout << "\n +++++++++++++++++++++++++++++++++++++++++ \n";
   cout << "\n     ONLY READING BEST PARAMETERS \n";
   cout << "\n     FOR PRINTING MAXIMIZED MAP \n";
   cout << " +++++++++++++++++++++++++++++++++++++++++ \n";
   while (!input.eof())
-    {
-      input.getline(line, 512);
-      strcpy(saveline, line);
-      char *token = strtok(line, " ");
-
-      if (token == NULL || line[0] == '#' || strlen(token) == 0)
-	{
-	  // comment or blank line
-	}
-      else if (strcmp(token, "PIXEL_SIZE") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  pixelSize = atof(token);
-	  if (pixelSize < 0 ) { cout << "*** Error: Negative pixelSize "; exit(1);}
-	  cout << "Pixel Sixe " << pixelSize << "\n";
-	}
-      else if (strcmp(token, "NUMBER_PIXELS") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  param_device.NumberPixels = int(atoi(token));
-	  if (param_device.NumberPixels < 0 ) { cout << "*** Error: Negative Number of Pixels "; exit(1);}
-	  cout << "Number of Pixels " << param_device.NumberPixels << "\n";
-	}
-      else if (strcmp(token, "BEST_ALPHA") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  angles[0].pos[0] = atof(token);
-	  cout << "Best Alpha " <<  angles[0].pos[0] << "\n";
-	}
-      else if (strcmp(token, "BEST_BETA") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  angles[0].pos[1] = atof(token);
-	  cout << "Best beta " <<  angles[0].pos[1] << "\n";
-	}
-      else if (strcmp(token, "BEST_GAMMA") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  angles[0].pos[2] = atof(token);
-	  cout << "Best Gamma " <<  angles[0].pos[2] << "\n";
-	}
-      else if (strcmp(token, "USE_QUATERNIONS") == 0)
-        {
-          cout << "Orientations with Quaternions. \n";
-          doquater= true;
-        }
-      else if (strcmp(token, "BEST_Q1") == 0)
-        {
-          token = strtok(NULL, " ");
-          angles[0].pos[0] = atof(token);
-          cout << "Best q1 " <<  angles[0].pos[0] << "\n";
-        }
-      else if (strcmp(token, "BEST_Q2") == 0)
-        {
-          token = strtok(NULL, " ");
-          angles[0].pos[1] = atof(token);
-          cout << "Best q2 " <<  angles[0].pos[1] << "\n";
-        }
-      else if (strcmp(token, "BEST_Q3") == 0)
-        {
-          token = strtok(NULL, " ");
-          angles[0].pos[2] = atof(token);
-          cout << "Best Q3 " <<  angles[0].pos[2] << "\n";
-        }
-            else if (strcmp(token, "BEST_Q4") == 0)
-        {
-          token = strtok(NULL, " ");
-          angles[0].quat4= atof(token);
-          cout << "Best Q3 " <<  angles[0].quat4 << "\n";
-        }
+  {
+    input.getline(line, 512);
+    strcpy(saveline, line);
+    char *token = strtok(line, " ");
 
-      else if (strcmp(token, "USE_PSF") == 0)
-        {
-          usepsf=true;
-          cout << "IMPORTANT: Using Point Spread Function. Thus, all parameters are in Real Space. \n";
-        }
-      else if (strcmp(token, "BEST_PSF_ENVELOPE") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  startGridEnvelop = atof(token);
-	  if (startGridEnvelop < 0 ) { cout << "*** Error: Negative START_ENVELOPE "; exit(1);}
-	  cout << "Best Envelope PSF " << startGridEnvelop << "\n";
-	}
-      else if (strcmp(token,"BEST_PSF_PHASE")==0)
-	{
-	  token = strtok(NULL," ");
-	  startGridCTF_phase=atof(token);
-	  cout << "Best Phase PSF " << startGridCTF_phase << "\n";
-	}
-      else if (strcmp(token,"BEST_PSF_AMP")==0)
-	{
-	  token = strtok(NULL," ");
-	  startGridCTF_amp=atof(token);
-          if(startGridCTF_amp <0){cout << "Error Negative Amplitud\n";exit(1);}
-	  cout << "Best Amplitud PSF " << startGridCTF_amp << "\n";
-	}
-      else if (strcmp(token, "BEST_CTF_B_ENV") == 0)
-        {
-          token = strtok(NULL, " ");
-          startGridEnvelop = atof(token);// / 2.f;
-          if (startGridEnvelop < 0 ) { cout << "*** Error: Negative START B-Env "; exit(1);}
-          cout << "Best B- Env " << startGridEnvelop << "\n";
-	  ctfparam=true;
-        }
-      else if (strcmp(token,"BEST_CTF_DEFOCUS")==0)
-        {
-          token = strtok(NULL," ");
-          startGridCTF_phase=atof(token)* M_PI * 2.f * 10000 * elecwavel;
-          cout << "Best Defocus " << startGridCTF_phase << "\n";
-          ctfparam=true;
-        }
-      else if (strcmp(token,"BEST_CTF_AMP")==0)
-        {
-          token = strtok(NULL," ");
-          startGridCTF_amp=atof(token);
-          if(startGridCTF_amp <0){cout << "Error Negative Amplitud\n";exit(1);}
-          cout << "Best Amplitud " << startGridCTF_amp << "\n";
-          ctfparam=true;
-        }
-      else if (strcmp(token, "BEST_DX") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  ddx = atoi(token);
-	  cout << "Best dx " << ddx << "\n";
-	}
-      else if (strcmp(token, "BEST_DY") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  ddy = atoi(token);
-	  cout << "Best dy " << ddy << "\n";
-	}
-      else if (strcmp(token, "BEST_NORM") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  bestnorm= atof(token);
-	  cout << "Best norm " << bestnorm << "\n";
-	}
-      else if (strcmp(token, "BEST_OFFSET") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  bestoff = atof(token);
-	  cout << "Best offset " << bestoff << "\n";
-	}
-      else if (strcmp(token, "WITHNOISE") == 0)
-	{
-	  token = strtok(NULL, " ");
-	  stnoise = atof(token);
-	  withnoise=true;
-	  cout << "Including noise with standard deviation " << stnoise << "\n";
-	}
-      else if (strcmp(token, "NO_PROJECT_RADIUS") == 0) //If projecting CA with amino-acid radius
-	{
-	  doaaradius = false;
-	  cout << "Not projecting corresponding radius \n";
-	}
-      else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0)//Number of Euler angle tripplets in non uniform Euler angle sampling
-	{
-	  printrotmod=true;
-	  cout << "PRINTING out rotatted models (best for debugging)\n";
-	}
-      else if (strcmp(token, "SHIFT_X") == 0)
-        {
-          token = strtok(NULL, " ");
-          shiftX=atoi(token);
-          cout << "Shifting initial model X by "<< shiftX << "\n" ;
-        }
-      else if (strcmp(token, "SHIFT_Y") == 0)
-        {
-          token = strtok(NULL, " ");
-          shiftY=atoi(token);
-          cout << "Shifting initial model Y by "<< shiftY << "\n" ;
-        }
+    if (token == NULL || line[0] == '#' || strlen(token) == 0)
+    {
+      // comment or blank line
+    }
+    else if (strcmp(token, "PIXEL_SIZE") == 0)
+    {
+      token = strtok(NULL, " ");
+      pixelSize = atof(token);
+      if (pixelSize < 0)
+      {
+        cout << "*** Error: Negative pixelSize ";
+        exit(1);
+      }
+      cout << "Pixel Sixe " << pixelSize << "\n";
+    }
+    else if (strcmp(token, "NUMBER_PIXELS") == 0)
+    {
+      token = strtok(NULL, " ");
+      param_device.NumberPixels = int(atoi(token));
+      if (param_device.NumberPixels < 0)
+      {
+        cout << "*** Error: Negative Number of Pixels ";
+        exit(1);
+      }
+      cout << "Number of Pixels " << param_device.NumberPixels << "\n";
+    }
+    else if (strcmp(token, "BEST_ALPHA") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[0] = atof(token);
+      cout << "Best Alpha " << angles[0].pos[0] << "\n";
+    }
+    else if (strcmp(token, "BEST_BETA") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[1] = atof(token);
+      cout << "Best beta " << angles[0].pos[1] << "\n";
+    }
+    else if (strcmp(token, "BEST_GAMMA") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[2] = atof(token);
+      cout << "Best Gamma " << angles[0].pos[2] << "\n";
+    }
+    else if (strcmp(token, "USE_QUATERNIONS") == 0)
+    {
+      cout << "Orientations with Quaternions. \n";
+      doquater = true;
+    }
+    else if (strcmp(token, "BEST_Q1") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[0] = atof(token);
+      cout << "Best q1 " << angles[0].pos[0] << "\n";
+    }
+    else if (strcmp(token, "BEST_Q2") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[1] = atof(token);
+      cout << "Best q2 " << angles[0].pos[1] << "\n";
+    }
+    else if (strcmp(token, "BEST_Q3") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].pos[2] = atof(token);
+      cout << "Best Q3 " << angles[0].pos[2] << "\n";
+    }
+    else if (strcmp(token, "BEST_Q4") == 0)
+    {
+      token = strtok(NULL, " ");
+      angles[0].quat4 = atof(token);
+      cout << "Best Q3 " << angles[0].quat4 << "\n";
+    }
 
+    else if (strcmp(token, "USE_PSF") == 0)
+    {
+      usepsf = true;
+      cout << "IMPORTANT: Using Point Spread Function. Thus, all parameters "
+              "are in Real Space. \n";
+    }
+    else if (strcmp(token, "BEST_PSF_ENVELOPE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridEnvelop = atof(token);
+      if (startGridEnvelop < 0)
+      {
+        cout << "*** Error: Negative START_ENVELOPE ";
+        exit(1);
+      }
+      cout << "Best Envelope PSF " << startGridEnvelop << "\n";
+    }
+    else if (strcmp(token, "BEST_PSF_PHASE") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_phase = atof(token);
+      cout << "Best Phase PSF " << startGridCTF_phase << "\n";
+    }
+    else if (strcmp(token, "BEST_PSF_AMP") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_amp = atof(token);
+      if (startGridCTF_amp < 0)
+      {
+        cout << "Error Negative Amplitud\n";
+        exit(1);
+      }
+      cout << "Best Amplitud PSF " << startGridCTF_amp << "\n";
+    }
+    else if (strcmp(token, "BEST_CTF_B_ENV") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridEnvelop = atof(token); // / 2.f;
+      if (startGridEnvelop < 0)
+      {
+        cout << "*** Error: Negative START B-Env ";
+        exit(1);
+      }
+      cout << "Best B- Env " << startGridEnvelop << "\n";
+      ctfparam = true;
+    }
+    else if (strcmp(token, "BEST_CTF_DEFOCUS") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_phase = atof(token) * M_PI * 2.f * 10000 * elecwavel;
+      cout << "Best Defocus " << startGridCTF_phase << "\n";
+      ctfparam = true;
+    }
+    else if (strcmp(token, "BEST_CTF_AMP") == 0)
+    {
+      token = strtok(NULL, " ");
+      startGridCTF_amp = atof(token);
+      if (startGridCTF_amp < 0)
+      {
+        cout << "Error Negative Amplitud\n";
+        exit(1);
+      }
+      cout << "Best Amplitud " << startGridCTF_amp << "\n";
+      ctfparam = true;
     }
+    else if (strcmp(token, "BEST_DX") == 0)
+    {
+      token = strtok(NULL, " ");
+      ddx = atoi(token);
+      cout << "Best dx " << ddx << "\n";
+    }
+    else if (strcmp(token, "BEST_DY") == 0)
+    {
+      token = strtok(NULL, " ");
+      ddy = atoi(token);
+      cout << "Best dy " << ddy << "\n";
+    }
+    else if (strcmp(token, "BEST_NORM") == 0)
+    {
+      token = strtok(NULL, " ");
+      bestnorm = atof(token);
+      cout << "Best norm " << bestnorm << "\n";
+    }
+    else if (strcmp(token, "BEST_OFFSET") == 0)
+    {
+      token = strtok(NULL, " ");
+      bestoff = atof(token);
+      cout << "Best offset " << bestoff << "\n";
+    }
+    else if (strcmp(token, "WITHNOISE") == 0)
+    {
+      token = strtok(NULL, " ");
+      stnoise = atof(token);
+      withnoise = true;
+      cout << "Including noise with standard deviation " << stnoise << "\n";
+    }
+    else if (strcmp(token, "NO_PROJECT_RADIUS") ==
+             0) // If projecting CA with amino-acid radius
+    {
+      doaaradius = false;
+      cout << "Not projecting corresponding radius \n";
+    }
+    else if (strcmp(token, "PRINT_ROTATED_MODELS") == 0) // Number of Euler
+                                                         // angle triplets in
+                                                         // non uniform Euler
+                                                         // angle sampling
+    {
+      printrotmod = true;
+      cout << "PRINTING out rotatted models (best for debugging)\n";
+    }
+    else if (strcmp(token, "SHIFT_X") == 0)
+    {
+      token = strtok(NULL, " ");
+      shiftX = atoi(token);
+      cout << "Shifting initial model X by " << shiftX << "\n";
+    }
+    else if (strcmp(token, "SHIFT_Y") == 0)
+    {
+      token = strtok(NULL, " ");
+      shiftY = atoi(token);
+      cout << "Shifting initial model Y by " << shiftY << "\n";
+    }
+  }
 
-  if(doquater){
-            if(angles[0].quat4*angles[0].quat4>1){cout << " Problem with quaternion "<< angles[0].quat4 << "\n";exit(1);}
-	    if(angles[0].pos[0]*angles[0].pos[0]>1){cout << " Problem with quaternion "<< angles[0].pos[0] << "\n";exit(1);}
-	    if(angles[0].pos[1]*angles[0].pos[1]>1){cout << " Problem with quaternion "<< angles[0].pos[1] << "\n";exit(1);}
-	    if(angles[0].pos[2]*angles[0].pos[2]>1){cout << " Problem with quaternion "<< angles[0].pos[2] << "\n";exit(1);}
-	}
+  if (doquater)
+  {
+    if (angles[0].quat4 * angles[0].quat4 > 1)
+    {
+      cout << " Problem with quaternion " << angles[0].quat4 << "\n";
+      exit(1);
+    }
+    if (angles[0].pos[0] * angles[0].pos[0] > 1)
+    {
+      cout << " Problem with quaternion " << angles[0].pos[0] << "\n";
+      exit(1);
+    }
+    if (angles[0].pos[1] * angles[0].pos[1] > 1)
+    {
+      cout << " Problem with quaternion " << angles[0].pos[1] << "\n";
+      exit(1);
+    }
+    if (angles[0].pos[2] * angles[0].pos[2] > 1)
+    {
+      cout << " Problem with quaternion " << angles[0].pos[2] << "\n";
+      exit(1);
+    }
+  }
 
   input.close();
 
-  if(usepsf &&  ctfparam){
-    cout << "Inconsitent Input: Using both PSF and CTF ?\n"; exit(1);
+  if (usepsf && ctfparam)
+  {
+    cout << "Inconsitent Input: Using both PSF and CTF ?\n";
+    exit(1);
   }
 
-  //Automatic definitions
-  numberGridPointsCTF_amp = 1 ;
+  // Automatic definitions
+  numberGridPointsCTF_amp = 1;
   gridCTF_amp = startGridCTF_amp;
   numberGridPointsCTF_phase = 1;
   gridCTF_phase = startGridCTF_phase;
-  numberGridPointsEnvelop = 1 ;
+  numberGridPointsEnvelop = 1;
   gridEnvelop = startGridEnvelop;
-  doquater=false;
+  // doquater = false;
 
   param_device.NumberFFTPixels1D = param_device.NumberPixels / 2 + 1;
   FFTMapSize = param_device.NumberPixels * param_device.NumberFFTPixels1D;
 
-  return 0;
+  nTotParallelMaps = CUDA_FFTS_AT_ONCE;
 
+  return 0;
 }
 
 void bioem_param::PrepareFFTs()
 {
   //********** PREPARING THE PLANS FOR THE FFTS ******************
-  if (mpi_rank == 0) cout << "Preparing FFTs\n";
+  if (mpi_rank == 0)
+    cout << "Preparing FFTs\n";
   releaseFFTPlans();
   mycomplex_t *tmp_map, *tmp_map2;
-  tmp_map = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param_device.NumberPixels * param_device.NumberPixels);
-  tmp_map2 = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param_device.NumberPixels * param_device.NumberPixels);
+  tmp_map = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) *
+                                          param_device.NumberPixels *
+                                          param_device.NumberPixels);
+  tmp_map2 = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) *
+                                           param_device.NumberPixels *
+                                           param_device.NumberPixels);
   Alignment = 64;
 
-  fft_plan_c2c_forward = myfftw_plan_dft_2d(param_device.NumberPixels, param_device.NumberPixels, tmp_map, tmp_map2, FFTW_FORWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
-  fft_plan_c2c_backward = myfftw_plan_dft_2d(param_device.NumberPixels, param_device.NumberPixels, tmp_map, tmp_map2, FFTW_BACKWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
-  fft_plan_r2c_forward = myfftw_plan_dft_r2c_2d(param_device.NumberPixels, param_device.NumberPixels, (myfloat_t*) tmp_map, tmp_map2, FFTW_MEASURE | FFTW_DESTROY_INPUT);
-  fft_plan_c2r_backward = myfftw_plan_dft_c2r_2d(param_device.NumberPixels, param_device.NumberPixels, tmp_map, (myfloat_t*) tmp_map2, FFTW_MEASURE | FFTW_DESTROY_INPUT);
-
-  if (fft_plan_c2c_forward == 0 || fft_plan_c2c_backward == 0 || fft_plan_r2c_forward == 0 || fft_plan_c2r_backward == 0)
-    {
-      cout << "Error planing FFTs\n";
-      exit(1);
-    }
+  fft_plan_c2c_forward = myfftw_plan_dft_2d(
+      param_device.NumberPixels, param_device.NumberPixels, tmp_map, tmp_map2,
+      FFTW_FORWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
+  fft_plan_c2c_backward = myfftw_plan_dft_2d(
+      param_device.NumberPixels, param_device.NumberPixels, tmp_map, tmp_map2,
+      FFTW_BACKWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
+  fft_plan_r2c_forward = myfftw_plan_dft_r2c_2d(
+      param_device.NumberPixels, param_device.NumberPixels,
+      (myfloat_t *) tmp_map, tmp_map2, FFTW_MEASURE | FFTW_DESTROY_INPUT);
+  fft_plan_c2r_backward = myfftw_plan_dft_c2r_2d(
+      param_device.NumberPixels, param_device.NumberPixels, tmp_map,
+      (myfloat_t *) tmp_map2, FFTW_MEASURE | FFTW_DESTROY_INPUT);
+
+  if (fft_plan_c2c_forward == 0 || fft_plan_c2c_backward == 0 ||
+      fft_plan_r2c_forward == 0 || fft_plan_c2r_backward == 0)
+  {
+    cout << "Error planing FFTs\n";
+    exit(1);
+  }
 
   myfftw_free(tmp_map);
   myfftw_free(tmp_map2);
 
   const int count = omp_get_max_threads();
-  fft_scratch_complex = new mycomplex_t*[count];
-  fft_scratch_real = new myfloat_t*[count];
+  fft_scratch_complex = new mycomplex_t *[count];
+  fft_scratch_real = new myfloat_t *[count];
 #pragma omp parallel
   {
 #pragma omp critical
     {
       const int i = omp_get_thread_num();
-      fft_scratch_complex[i] = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param_device.NumberPixels * param_device.NumberFFTPixels1D);
-      fft_scratch_real[i] = (myfloat_t *) myfftw_malloc(sizeof(myfloat_t) * param_device.NumberPixels * param_device.NumberPixels);
+      fft_scratch_complex[i] = (mycomplex_t *) myfftw_malloc(
+          sizeof(mycomplex_t) * param_device.NumberPixels *
+          param_device.NumberFFTPixels1D);
+      fft_scratch_real[i] = (myfloat_t *) myfftw_malloc(
+          sizeof(myfloat_t) * param_device.NumberPixels *
+          param_device.NumberPixels);
     }
   }
 
   fft_plans_created = 1;
 }
 
-
 void bioem_param::releaseFFTPlans()
 {
   if (fft_plans_created)
+  {
+    const int count = omp_get_max_threads();
+    for (int i = 0; i < count; i++)
     {
-      const int count = omp_get_max_threads();
-      for (int i = 0;i < count;i++)
-	{
-	  myfftw_free(fft_scratch_complex[i]);
-	  myfftw_free(fft_scratch_real[i]);
-	}
-      delete[] fft_scratch_complex;
-      delete[] fft_scratch_real;
-
-      myfftw_destroy_plan(fft_plan_c2c_forward);
-      myfftw_destroy_plan(fft_plan_c2c_backward);
-      myfftw_destroy_plan(fft_plan_r2c_forward);
-      myfftw_destroy_plan(fft_plan_c2r_backward);
-      myfftw_cleanup();
+      myfftw_free(fft_scratch_complex[i]);
+      myfftw_free(fft_scratch_real[i]);
     }
+    delete[] fft_scratch_complex;
+    delete[] fft_scratch_real;
+
+    myfftw_destroy_plan(fft_plan_c2c_forward);
+    myfftw_destroy_plan(fft_plan_c2c_backward);
+    myfftw_destroy_plan(fft_plan_r2c_forward);
+    myfftw_destroy_plan(fft_plan_c2r_backward);
+    myfftw_cleanup();
+  }
   fft_plans_created = 0;
 }
 
-int bioem_param::CalculateGridsParam(const char* fileangles) //TO DO FOR QUATERNIONS
+int bioem_param::CalculateGridsParam(
+    const char *fileangles) // TO DO FOR QUATERNIONS
 {
   // **************************************************************************************
-  // **************** Routine that pre-calculates Orientation Grids**********************
+  // **************** Routine that pre-calculates Orientation
+  // Grids**********************
   // ************************************************************************************
 
-  
-  if(!doquater){	
+  if (!doquater)
+  {
 
     //*********** With Euler angles *******************
     cout << "Analysis Using Default Euler Angles\n";
-    if(!notuniformangles){
+    if (!notuniformangles)
+    {
 
-      if(yespriorAngles){
-	cout << "Error: This option is not valid with prior for orientations\nPlease provide separate file with orientations and priors";
-	exit(1);
+      if (yespriorAngles)
+      {
+        cout << "Error: This option is not valid with prior for "
+                "orientations\nPlease provide separate file with orientations "
+                "and priors";
+        exit(1);
       }
-    
+
       cout << "Calculating Grids in Euler Angles\n";
-    
+
       myfloat_t grid_alpha, cos_grid_beta;
       int n = 0;
 
-      //alpha and gamma are uniform in -PI,PI
+      // alpha and gamma are uniform in -PI,PI
       grid_alpha = 2.f * M_PI / (myfloat_t) angleGridPointsAlpha;
 
-      //cosine beta is uniform in -1,1
+      // cosine beta is uniform in -1,1
       cos_grid_beta = 2.f / (myfloat_t) angleGridPointsBeta;
 
       // Euler Angle Array
 
-      angles = (myfloat3_t*) mallocchk(  angleGridPointsAlpha * angleGridPointsBeta * angleGridPointsAlpha * sizeof(myfloat3_t));
-
-
-      for (int ialpha = 0; ialpha < angleGridPointsAlpha; ialpha ++)
-	{
-	  for (int ibeta = 0; ibeta < angleGridPointsBeta; ibeta ++)
-	    {
-	      for (int igamma = 0; igamma < angleGridPointsAlpha; igamma ++)
-		{
-		  angles[n].pos[0] = (myfloat_t) ialpha * grid_alpha - M_PI + grid_alpha * 0.5f; //ALPHA centered in the middle
-		  angles[n].pos[1] = acos((myfloat_t) ibeta * cos_grid_beta - 1 + cos_grid_beta * 0.5f); //BETA centered in the middle
-		  angles[n].pos[2] = (myfloat_t) igamma * grid_alpha - M_PI + grid_alpha * 0.5f; //GAMMA centered in the middle
-                  angles[n].quat4 =0.0;
-		  n++;
-		}
-	    }
-	}
-      nTotGridAngles = n;
-      voluang= grid_alpha * grid_alpha * cos_grid_beta / (2.f * M_PI) / (2.f * M_PI) / 2.f * priorMod;
+      angles =
+          (myfloat3_t *) mallocchk(angleGridPointsAlpha * angleGridPointsBeta *
+                                   angleGridPointsAlpha * sizeof(myfloat3_t));
 
-    } else{
+      for (int ialpha = 0; ialpha < angleGridPointsAlpha; ialpha++)
+      {
+        for (int ibeta = 0; ibeta < angleGridPointsBeta; ibeta++)
+        {
+          for (int igamma = 0; igamma < angleGridPointsAlpha; igamma++)
+          {
+            angles[n].pos[0] =
+                (myfloat_t) ialpha * grid_alpha - M_PI +
+                grid_alpha * 0.5f; // ALPHA centered in the middle
+            angles[n].pos[1] =
+                acos((myfloat_t) ibeta * cos_grid_beta - 1 +
+                     cos_grid_beta * 0.5f); // BETA centered in the middle
+            angles[n].pos[2] =
+                (myfloat_t) igamma * grid_alpha - M_PI +
+                grid_alpha * 0.5f; // GAMMA centered in the middle
+            angles[n].quat4 = 0.0;
+            n++;
+          }
+        }
+      }
+      nTotGridAngles = n;
+      voluang = grid_alpha * grid_alpha * cos_grid_beta / (2.f * M_PI) /
+                (2.f * M_PI) / 2.f * priorMod;
+    }
+    else
+    {
 
-      //************ Reading Euler Angles From File ************************** 
+      //************ Reading Euler Angles From File **************************
       ifstream input(fileangles);
 
       if (!input.good())
-	{
-	  cout << "Euler Angle File Failed to open file " <<  fileangles << " " << endl ;
-	  exit(1);
-	}
+      {
+        cout << "Euler Angle File Failed to open file " << fileangles << " "
+             << endl;
+        exit(1);
+      }
 
       char line[512] = {0};
       //      char saveline[512];
 
-      int n=0;
+      int n = 0;
 
       // First line tels the number of rows
       input.getline(line, 511);
 
-      char tmpVals[36]  = {0};
+      char tmpVals[36] = {0};
 
-      strncpy (tmpVals, line, 12);
-      sscanf (tmpVals, "%d", &NotUn_angles);
+      strncpy(tmpVals, line, 12);
+      sscanf(tmpVals, "%d", &NotUn_angles);
       cout << "Number of Euler angles " << NotUn_angles << "\n";
 
-
-      if(NotUn_angles<1) {
-	cout << "\nNot defined number of Euler angles in INPUT file:" << endl ;
-	//      cout << "Use key word: NOT_UNIFORM_TOTAL_ANGS\n";
-	exit(1);
+      if (NotUn_angles < 1)
+      {
+        cout << "\nNot defined number of Euler angles in INPUT file:" << endl;
+        //      cout << "Use key word: NOT_UNIFORM_TOTAL_ANGS\n";
+        exit(1);
       }
 
       // NotUn_angles=NotUn_angles+1;
 
+      angles = (myfloat3_t *) mallocchk(NotUn_angles * sizeof(myfloat3_t));
 
-      angles = (myfloat3_t*) mallocchk( NotUn_angles * sizeof(myfloat3_t));
-
-      if(yespriorAngles){
-	delete[] angprior;
-	angprior = new myfloat_t[NotUn_angles];
+      if (yespriorAngles)
+      {
+        delete[] angprior;
+        angprior = new myfloat_t[NotUn_angles];
       }
       while (!input.eof())
-	{
+      {
+
+        input.getline(line, 511);
+
+        if (n < NotUn_angles)
+        {
 
-	  input.getline(line, 511);
+          float a = 0., b = 0., g = 0., pp = 0.;
 
-	  if(n< NotUn_angles){
+          char tmpVals[60] = {0};
 
-            float a=0.,b=0.,g=0.,pp=0.;
-            
-	    char tmpVals[60] = {0};
+          strncpy(tmpVals, line, 12);
+          sscanf(tmpVals, "%f", &a);
 
-            strncpy (tmpVals, line, 12);
-	    sscanf (tmpVals, "%f", &a);
+          strncpy(tmpVals, line + 12, 12);
+          sscanf(tmpVals, "%f", &b);
 
-	    strncpy (tmpVals, line + 12, 12);
-	    sscanf (tmpVals, "%f", &b);
+          strncpy(tmpVals, line + 24, 12);
+          sscanf(tmpVals, "%f", &g);
 
-	    strncpy (tmpVals, line + 24, 12);
-	    sscanf (tmpVals, "%f", &g);
+          if (yespriorAngles)
+          {
+            strncpy(tmpVals, line + 36, 12);
+            sscanf(tmpVals, "%f", &pp);
+            if (pp < 0.0000001)
+              cout << "Sure you're input is correct? Very small prior.\n";
+            angprior[n] = (myfloat_t) pp;
+          }
 
-	    if(yespriorAngles){
-	      strncpy (tmpVals, line + 36, 12);
-	      sscanf (tmpVals, "%f", &pp);
-              if(pp <0.0000001)cout << "Sure you're input is correct? Very small prior.\n";
-	      angprior[n] = (myfloat_t) pp; 
-	    }
-            
-            angles[n].pos[0] = (myfloat_t) a;
-	    angles[n].pos[1] = (myfloat_t) b;
-	    angles[n].pos[2] = (myfloat_t) g;
-            angles[n].quat4 =0.0;//just to be sure */
+          angles[n].pos[0] = (myfloat_t) a;
+          angles[n].pos[1] = (myfloat_t) b;
+          angles[n].pos[2] = (myfloat_t) g;
+          angles[n].quat4 = 0.0; // just to be sure */
 #ifdef DEBUG
-//	    if(yespriorAngles) 
-cout << "check orient: " << n << " " << " " << angles[n].pos[0] << " " << angles[n].pos[1] << " " << angles[n].pos[2] << " prior:\n ";// << angprior[n]<< "\n";
+          //	    if(yespriorAngles)
+          cout << "check orient: " << n << " "
+               << " " << angles[n].pos[0] << " " << angles[n].pos[1] << " "
+               << angles[n].pos[2] << " prior:\n "; // << angprior[n]<< "\n";
 #endif
-	  }
-	  n++;
-	  if(NotUn_angles+1 < n) {
-	    cout << "Not properly defined total Euler angles " << n << " instead of " << NotUn_angles << "\n";
-	    exit(1);
-	  }
-	}
+        }
+        n++;
+        if (NotUn_angles + 1 < n)
+        {
+          cout << "Not properly defined total Euler angles " << n
+               << " instead of " << NotUn_angles << "\n";
+          exit(1);
+        }
+      }
       nTotGridAngles = NotUn_angles;
-      voluang= 1./ (myfloat_t) NotUn_angles * priorMod;
+      voluang = 1. / (myfloat_t) NotUn_angles * priorMod;
       input.close();
     }
-    
-  } else {
+  }
+  else
+  {
     //************** Analysis with Quaternions
 
-    if(!notuniformangles){
+    if (!notuniformangles)
+    {
       //************* Grid of Quaternions *******************
       cout << "Calculating Grids in Quaterions\n ";
 
-      if(yespriorAngles){
-	cout << "This option is not valid with prior for orientations\n It is necessary to provide a separate file with the angles and priors";
-	exit(1);
+      if (yespriorAngles)
+      {
+        cout << "This option is not valid with prior for orientations\n It is "
+                "necessary to provide a separate file with the angles and "
+                "priors";
+        exit(1);
       }
 
-      if (GridPointsQuatern < 0 ) { cout << "*** Missing Gridpoints Quaternions \n after QUATERNIONS (int)\n (int)=Number of gridpoins per dimension"; exit(1);}  
+      if (GridPointsQuatern < 0)
+      {
+        cout << "*** Missing Gridpoints Quaternions \n after QUATERNIONS "
+                "(int)\n (int)=Number of gridpoins per dimension";
+        exit(1);
+      }
 
-      myfloat_t dgridq,q1,q2,q3;
-      int n=0;
+      myfloat_t dgridq, q1, q2, q3;
+      int n = 0;
 
-      dgridq=2.f/(myfloat_t) (GridPointsQuatern +1);
+      dgridq = 2.f / (myfloat_t)(GridPointsQuatern + 1);
 
       // loop to calculate the number ofpoints in the quaternion shpere  rad < 1
-      for (int ialpha = 0; ialpha < GridPointsQuatern + 1 ; ialpha ++)
-	{
-	  q1=(myfloat_t) ialpha * dgridq -1.f + 0.5 * dgridq;
-	  for (int ibeta = 0; ibeta < GridPointsQuatern + 1 ; ibeta ++)
-	    {
-	      q2=(myfloat_t) ibeta * dgridq -1.f + 0.5 * dgridq;
-	      for (int igamma = 0; igamma < GridPointsQuatern + 1; igamma ++)
-		{
-		  q3= (myfloat_t) igamma * dgridq -1.f + 0.5 * dgridq;
-		  if(q1*q1+q2*q2+q3*q3 <= 1.f)n=n+2;
-		
-		}
-	    }
-	}
-
-      //allocating angles
+      for (int ialpha = 0; ialpha < GridPointsQuatern + 1; ialpha++)
+      {
+        q1 = (myfloat_t) ialpha * dgridq - 1.f + 0.5 * dgridq;
+        for (int ibeta = 0; ibeta < GridPointsQuatern + 1; ibeta++)
+        {
+          q2 = (myfloat_t) ibeta * dgridq - 1.f + 0.5 * dgridq;
+          for (int igamma = 0; igamma < GridPointsQuatern + 1; igamma++)
+          {
+            q3 = (myfloat_t) igamma * dgridq - 1.f + 0.5 * dgridq;
+            if (q1 * q1 + q2 * q2 + q3 * q3 <= 1.f)
+              n = n + 2;
+          }
+        }
+      }
+
+      // allocating angles
 
       nTotGridAngles = n;
 
-      angles = (myfloat3_t*) mallocchk( nTotGridAngles * sizeof(myfloat3_t));
- 
-      voluang= dgridq * dgridq * dgridq * priorMod;
+      angles = (myfloat3_t *) mallocchk(nTotGridAngles * sizeof(myfloat3_t));
+
+      voluang = dgridq * dgridq * dgridq * priorMod;
 
-      n=0;
+      n = 0;
       // assigning values
-      for (int ialpha = 0; ialpha < GridPointsQuatern + 1; ialpha ++)
-	{
-	  q1=(myfloat_t) ialpha * dgridq -1.f + 0.5 * dgridq;
-	  for (int ibeta = 0; ibeta < GridPointsQuatern + 1; ibeta ++)
-	    {
-	      q2=(myfloat_t) ibeta * dgridq -1.f + 0.5 * dgridq;
-	      for (int igamma = 0; igamma < GridPointsQuatern + 1 ; igamma ++)
-		{
-		  q3= (myfloat_t) igamma * dgridq -1.f + 0.5 * dgridq;
-		  if(q1*q1+q2*q2+q3*q3 <= 1.f){
-		
-		    angles[n].pos[0] = q1; 
-		    angles[n].pos[1] = q2;
-		    angles[n].pos[2] = q3;
-		    angles[n].quat4=sqrt(1.f-q1*q1-q2*q2-q3*q3);
-		    n++;
-		    //Adding the negative
-		    angles[n].pos[0] = q1;
-		    angles[n].pos[1] = q2;
-		    angles[n].pos[2] = q3;
-		    angles[n].quat4=-sqrt(1.f-q1*q1-q2*q2-q3*q3);
-		    n++;
-		  }
-		}
-	    }
-	}
-	
-    } else{
+      for (int ialpha = 0; ialpha < GridPointsQuatern + 1; ialpha++)
+      {
+        q1 = (myfloat_t) ialpha * dgridq - 1.f + 0.5 * dgridq;
+        for (int ibeta = 0; ibeta < GridPointsQuatern + 1; ibeta++)
+        {
+          q2 = (myfloat_t) ibeta * dgridq - 1.f + 0.5 * dgridq;
+          for (int igamma = 0; igamma < GridPointsQuatern + 1; igamma++)
+          {
+            q3 = (myfloat_t) igamma * dgridq - 1.f + 0.5 * dgridq;
+            if (q1 * q1 + q2 * q2 + q3 * q3 <= 1.f)
+            {
+
+              angles[n].pos[0] = q1;
+              angles[n].pos[1] = q2;
+              angles[n].pos[2] = q3;
+              angles[n].quat4 = sqrt(1.f - q1 * q1 - q2 * q2 - q3 * q3);
+              n++;
+              // Adding the negative
+              angles[n].pos[0] = q1;
+              angles[n].pos[1] = q2;
+              angles[n].pos[2] = q3;
+              angles[n].quat4 = -sqrt(1.f - q1 * q1 - q2 * q2 - q3 * q3);
+              n++;
+            }
+          }
+        }
+      }
+    }
+    else
+    {
 
       //******** Reading Quaternions From a File ***************************
       ifstream input(fileangles);
 
       if (!input.good())
-	{
-	  cout << "Problem with Quaterion List file " <<  fileangles << " " << endl ;
-	  exit(1);
-	}
+      {
+        cout << "Problem with Quaterion List file " << fileangles << " "
+             << endl;
+        exit(1);
+      }
 
       char line[512] = {0};
-      int n=0;
+      int n = 0;
 
       // First line tels the number of rows
       input.getline(line, 511);
       int ntotquat;
 
-      char tmpVals[60]  = {0};
+      char tmpVals[60] = {0};
 
-      strncpy (tmpVals, line, 12);
-      sscanf (tmpVals, "%d", &ntotquat);
-      if(ntotquat <1){
-        cout << "Invalid Number of quaternions " << ntotquat << "\n"; exit(1);
-      }else{
-	cout << "Number of quaternions " << ntotquat << "\n";
+      strncpy(tmpVals, line, 12);
+      sscanf(tmpVals, "%d", &ntotquat);
+      if (ntotquat < 1)
+      {
+        cout << "Invalid Number of quaternions " << ntotquat << "\n";
+        exit(1);
+      }
+      else
+      {
+        cout << "Number of quaternions " << ntotquat << "\n";
       }
-      angles = (myfloat3_t*) mallocchk( ntotquat * sizeof(myfloat3_t));
+      angles = (myfloat3_t *) mallocchk(ntotquat * sizeof(myfloat3_t));
       //    delete[] angles;
       //    angles = new myfloat3_t[ ntotquat] ;
 
-      if(yespriorAngles){
-	delete[] angprior;
-	angprior = new myfloat_t[ ntotquat ];
-      }      
+      if (yespriorAngles)
+      {
+        delete[] angprior;
+        angprior = new myfloat_t[ntotquat];
+      }
       while (!input.eof())
-	{
-	  input.getline(line, 511);
-	  if(n< ntotquat){
-	    myfloat_t q1,q2,q3,q4,pp;
-
-	    q1=-99999; q2=-99999;q3=-99999;q4=-99999;
-	    char tmpVals[60]  = {0};
-
-	    strncpy (tmpVals, line, 12);
-	    sscanf (tmpVals, "%f", &q1);
-
-	    strncpy (tmpVals, line + 12, 12);
-	    sscanf (tmpVals, "%f", &q2);
-
-	    strncpy (tmpVals, line + 24, 12);
-	    sscanf (tmpVals, "%f", &q3);
-	  
-	    strncpy (tmpVals, line + 36, 12);
-	    sscanf (tmpVals, "%f", &q4);
-
-	    angles[n].pos[0] = q1;
-	    angles[n].pos[1] = q2;
-	    angles[n].pos[2] = q3;
-	    angles[n].quat4 = q4;
-
-	    if(q1<-1 || q1 > 1){ cout << "Error reading quaterions from list. Value out of range "  << q1 << " row " << n << "\n"; exit(1);};
-	    if(q2<-1 || q2 > 1){ cout << "Error reading quaterions from list. Value out of range "  << q2 << " row " << n << "\n"; exit(1);};
-	    if(q3<-1 || q3 > 1){ cout << "Error reading quaterions from list. Value out of range "  << q3 << " row " << n << "\n"; exit(1);};
-	    if(q4<-1 || q4 > 1){ cout << "Error reading quaterions from list. Value out of range "  << q4 << " row " << n << "\n"; exit(1);};
-
-
-	    if(yespriorAngles){
-	      strncpy (tmpVals, line + 48, 12);
-	      sscanf (tmpVals, "%f", &pp);
-              if(pp <0.0000001)cout << "Sure you're input is correct? Very small prior.\n";
-	      angprior[n] = pp;}
+      {
+        input.getline(line, 511);
+        if (n < ntotquat)
+        {
+          float q1, q2, q3, q4, pp;
+
+          q1 = -99999.;
+          q2 = -99999.;
+          q3 = -99999.;
+          q4 = -99999.;
+          char tmpVals[60] = {0};
+
+          strncpy(tmpVals, line, 12);
+          sscanf(tmpVals, "%f", &q1);
+
+          strncpy(tmpVals, line + 12, 12);
+          sscanf(tmpVals, "%f", &q2);
+
+          strncpy(tmpVals, line + 24, 12);
+          sscanf(tmpVals, "%f", &q3);
+
+          strncpy(tmpVals, line + 36, 12);
+          sscanf(tmpVals, "%f", &q4);
+
+          angles[n].pos[0] = q1;
+          angles[n].pos[1] = q2;
+          angles[n].pos[2] = q3;
+          angles[n].quat4 = q4;
+
+          if (q1 < -1 || q1 > 1)
+          {
+            cout << "Error reading quaterions from list. Value out of range "
+                 << q1 << " row " << n << "\n";
+            exit(1);
+          };
+          if (q2 < -1 || q2 > 1)
+          {
+            cout << "Error reading quaterions from list. Value out of range "
+                 << q2 << " row " << n << "\n";
+            exit(1);
+          };
+          if (q3 < -1 || q3 > 1)
+          {
+            cout << "Error reading quaterions from list. Value out of range "
+                 << q3 << " row " << n << "\n";
+            exit(1);
+          };
+          if (q4 < -1 || q4 > 1)
+          {
+            cout << "Error reading quaterions from list. Value out of range "
+                 << q4 << " row " << n << "\n";
+            exit(1);
+          };
+
+          if (yespriorAngles)
+          {
+            strncpy(tmpVals, line + 48, 12);
+            sscanf(tmpVals, "%f", &pp);
+            if (pp < 0.0000001)
+              cout << "Sure you're input is correct? Very small prior.\n";
+            angprior[n] = pp;
+          }
 #ifdef DEBUG
-	//    if(yespriorAngles) 
-        cout << "check orient: " << n << " "  << angles[n].pos[0] << " " << angles[n].pos[1] << " " << angles[n].pos[2] << " prior: " << angles[n].quat4 << "\n";
+          //    if(yespriorAngles)
+          cout << "check orient: " << n << " " << angles[n].pos[0] << " "
+               << angles[n].pos[1] << " " << angles[n].pos[2]
+               << " prior: " << angles[n].quat4 << "\n";
 #endif
-	    
-	  }
-	  n++;
-	  if(ntotquat+1 < n) {
-	    cout << "More quaternions than expected in header " << n << " instead of " << NotUn_angles << "\n";
-	    exit(1);
-	  }
-	}
+        }
+        n++;
+        if (ntotquat + 1 < n)
+        {
+          cout << "More quaternions than expected in header " << n
+               << " instead of " << NotUn_angles << "\n";
+          exit(1);
+        }
+      }
       nTotGridAngles = ntotquat;
-      voluang= 1./ (myfloat_t) ntotquat * priorMod;
+      voluang = 1. / (myfloat_t) ntotquat * priorMod;
 
-      input.close();   
+      input.close();
     }
 
-
-    cout << "Analysis with Quaternions. Total number of quaternions " << nTotGridAngles << "\n";
+    cout << "Analysis with Quaternions. Total number of quaternions "
+         << nTotGridAngles << "\n";
   }
 
-
-  return(0);
-
+  return (0);
 }
 
 int bioem_param::CalculateRefCTF()
 {
   // **************************************************************************************
-  // ********** Routine that pre-calculates Kernels for Convolution **********************
+  // ********** Routine that pre-calculates Kernels for Convolution
+  // **********************
   // ************************************************************************************
 
   myfloat_t amp, env, phase, ctf, radsq;
-  myfloat_t* localCTF;
-  mycomplex_t* localout;
+  myfloat_t *localCTF;
+  mycomplex_t *localout;
   int nctfmax = param_device.NumberPixels / 2;
   int n = 0;
 
-  localCTF = (myfloat_t *) myfftw_malloc(sizeof(myfloat_t) * param_device.NumberPixels * param_device.NumberPixels);
-  localout = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) * param_device.NumberPixels * param_device.NumberFFTPixels1D);
+  localCTF = (myfloat_t *) myfftw_malloc(sizeof(myfloat_t) *
+                                         param_device.NumberPixels *
+                                         param_device.NumberPixels);
+  localout = (mycomplex_t *) myfftw_malloc(sizeof(mycomplex_t) *
+                                           param_device.NumberPixels *
+                                           param_device.NumberFFTPixels1D);
 
-  nTotCTFs = numberGridPointsCTF_amp * numberGridPointsCTF_phase * numberGridPointsEnvelop;
+  nTotCTFs = numberGridPointsCTF_amp * numberGridPointsCTF_phase *
+             numberGridPointsEnvelop;
   delete[] refCTF;
   refCTF = new mycomplex_t[getRefCtfCount()];
   delete[] CtfParam;
@@ -1136,202 +1463,265 @@ int bioem_param::CalculateRefCTF()
 
   myfloat_t normctf;
 
- 
-  gridCTF_amp = (endGridCTF_amp - startGridCTF_amp) / (myfloat_t) numberGridPointsCTF_amp;
-  gridCTF_phase = (endGridCTF_phase - startGridCTF_phase) / (myfloat_t) numberGridPointsCTF_phase;
-  gridEnvelop =  (endGridEnvelop - startGridEnvelop) / (myfloat_t) numberGridPointsEnvelop; 
-
+  gridCTF_amp =
+      (endGridCTF_amp - startGridCTF_amp) / (myfloat_t) numberGridPointsCTF_amp;
+  gridCTF_phase = (endGridCTF_phase - startGridCTF_phase) /
+                  (myfloat_t) numberGridPointsCTF_phase;
+  gridEnvelop =
+      (endGridEnvelop - startGridEnvelop) / (myfloat_t) numberGridPointsEnvelop;
 
-  //if only one grid point for PSF kernel:
-  if( (myfloat_t) numberGridPointsCTF_amp == 1 ) { 
-    gridCTF_amp = startGridCTF_amp;}
-  else if ( (endGridCTF_amp - startGridCTF_amp) < 0. ){
-    cout << "Error: Interval of amplitude in CTF/PSF Negative"; exit(1);
+  // if only one grid point for PSF kernel:
+  if ((myfloat_t) numberGridPointsCTF_amp == 1)
+  {
+    gridCTF_amp = startGridCTF_amp;
   }
-  if( (myfloat_t) numberGridPointsCTF_phase == 1 ) {
+  else if ((endGridCTF_amp - startGridCTF_amp) < 0.)
+  {
+    cout << "Error: Interval of amplitude in CTF/PSF Negative";
+    exit(1);
+  }
+  if ((myfloat_t) numberGridPointsCTF_phase == 1)
+  {
     gridCTF_phase = startGridCTF_phase;
-  }else if ( (endGridCTF_phase - startGridCTF_phase) < 0.){
-    cout << "Error: Interval of PHASE in CTF/PSF is Negative"; exit(1);
   }
-  if( (myfloat_t) numberGridPointsEnvelop == 1 ) {
+  else if ((endGridCTF_phase - startGridCTF_phase) < 0.)
+  {
+    cout << "Error: Interval of PHASE in CTF/PSF is Negative";
+    exit(1);
+  }
+  if ((myfloat_t) numberGridPointsEnvelop == 1)
+  {
     gridEnvelop = startGridEnvelop;
-  } else if ( (endGridEnvelop - startGridEnvelop) < 0.) {
-    cout << "Error: Interval of Envelope in CTF/PSF is Negative"; exit(1);
+  }
+  else if ((endGridEnvelop - startGridEnvelop) < 0.)
+  {
+    cout << "Error: Interval of Envelope in CTF/PSF is Negative";
+    exit(1);
   }
 
-  	
-  //More checks with input parameters
+  // More checks with input parameters
   // Envelope should not have a standard deviation greater than Npix/2
-  if(sqrt(1./( (myfloat_t) numberGridPointsEnvelop  * gridEnvelop + startGridEnvelop))> float(param_device.NumberPixels)/2.0 && usepsf) {
-    cout << "MAX Standard deviation of envelope is larger than Allowed KERNEL Length\n";
+  if (sqrt(1. / ((myfloat_t) numberGridPointsEnvelop * gridEnvelop +
+                 startGridEnvelop)) > float(param_device.NumberPixels) / 2.0 &&
+      usepsf)
+  {
+    cout << "MAX Standard deviation of envelope is larger than Allowed KERNEL "
+            "Length\n";
     exit(1);
   }
   // Envelop param should be positive
-  if(!printModel && (startGridCTF_amp < 0 || endGridCTF_amp > 1)){
+  if (!printModel && (startGridCTF_amp < 0 || endGridCTF_amp > 1))
+  {
     cout << "Error: PSF Amplitud should be between 0 and 1\n";
     cout << "start: " << startGridCTF_amp << "End: " << endGridCTF_amp << "\n";
     exit(1);
   }
 
-  if(!printModel && endGridCTF_amp < startGridCTF_amp){
-    cout << "Error: values of amplitud starting is larger than ending points\n" ;
+  if (!printModel && endGridCTF_amp < startGridCTF_amp)
+  {
+    cout << "Error: values of amplitud starting is larger than ending points\n";
     cout << "start: " << startGridCTF_amp << " End: " << endGridCTF_amp << "\n";
     exit(1);
   }
 
+  for (int iamp = 0; iamp < numberGridPointsCTF_amp;
+       iamp++) // Loop over amplitud
+  {
+    amp = (myfloat_t) iamp * gridCTF_amp + startGridCTF_amp;
 
-  for (int iamp = 0; iamp <  numberGridPointsCTF_amp ; iamp++) //Loop over amplitud
-    {
-      amp = (myfloat_t) iamp * gridCTF_amp + startGridCTF_amp;
-
-      for (int iphase = 0; iphase <  numberGridPointsCTF_phase ; iphase++)//Loop over phase
-	{
-	  phase = (myfloat_t) iphase * gridCTF_phase + startGridCTF_phase;
-
-	  for (int ienv = 0; ienv <  numberGridPointsEnvelop ; ienv++)//Loop over envelope
-	    {
-	      env = (myfloat_t) ienv * gridEnvelop + startGridEnvelop;
-
-	      memset(localCTF, 0, param_device.NumberPixels * param_device.NumberPixels * sizeof(myfloat_t));
-
-	      normctf = 0.0;
-
-	      //	      cout <<"values " << amp << " " << phase << " " << env <<"\n";
-	      //Complex CTF
-	      mycomplex_t* curRef = &refCTF[n * FFTMapSize];
-
-	      // Initialzing everything to zero just to be sure
-	      for(int i = 0; i < param_device.NumberPixels * param_device.NumberFFTPixels1D; i++ ){
-                curRef[i][0] =0.f;
-                curRef[i][1] =0.f;
-	      }
-
-	      for(int i = 0; i < param_device.NumberPixels; i++)
-		{
-		  for(int j = 0; j < param_device.NumberPixels; j++)
-		    {
-		      localCTF[i * param_device.NumberPixels + j]=0.f;
-		    }
-		}
-
-	      if(usepsf){
-		normctf=0.0;
-                   
-		for(int i = 0; i < param_device.NumberPixels; i++)
-                  {
-                    for(int j = 0; j < param_device.NumberPixels; j++)
-                      {
-			int ri=0,rj=0;
-
-			//Calculating the distance from the periodic center at 0,0
-
-			if(i<nctfmax+1){ ri=i; }else{ ri=param_device.NumberPixels-i;};
-			if(j<nctfmax+1){ rj=j; }else{ rj=param_device.NumberPixels-j;};
-			radsq = (myfloat_t) ((ri) * (ri) + (rj) *(rj)) * pixelSize * pixelSize;
-
-                        ctf = exp(-radsq * env / 2.0) * (- amp * cos(radsq * phase / 2.0) - sqrt((1 - amp * amp)) * sin(radsq * phase / 2.0)) ;
-
-                        localCTF[i * param_device.NumberPixels + j] = (myfloat_t) ctf;
-
-			normctf += localCTF[i * param_device.NumberPixels + j];
+    for (int iphase = 0; iphase < numberGridPointsCTF_phase;
+         iphase++) // Loop over phase
+    {
+      phase = (myfloat_t) iphase * gridCTF_phase + startGridCTF_phase;
 
-			//	    cout << "TT " << i << " " << j << " " << localCTF[i * param_device.NumberPixels + j]  << "\n";
-		      }
-		  }
+      for (int ienv = 0; ienv < numberGridPointsEnvelop;
+           ienv++) // Loop over envelope
+      {
+        env = (myfloat_t) ienv * gridEnvelop + startGridEnvelop;
 
-		//Normalization
-		for(int i = 0; i < param_device.NumberPixels; i++)
-                  {
-                    for(int j = 0; j < param_device.NumberPixels; j++)
-                      {
-                        localCTF[i * param_device.NumberPixels + j]= localCTF[i * param_device.NumberPixels + j]/normctf;
-		      }
-		  }
+        memset(localCTF, 0, param_device.NumberPixels *
+                                param_device.NumberPixels * sizeof(myfloat_t));
 
-		
-      
-		//Calling FFT_Forward
-		myfftw_execute_dft_r2c(fft_plan_r2c_forward, localCTF, localout);
+        normctf = 0.0;
 
-		// Saving the Reference PSFs
+        //	      cout <<"values " << amp << " " << phase << " " << env
+        //<<"\n";
+        // Complex CTF
+        mycomplex_t *curRef = &refCTF[n * FFTMapSize];
 
-		for(int i = 0; i < param_device.NumberPixels * param_device.NumberFFTPixels1D; i++ )
-		  {
-		    curRef[i][0] = localout[i][0];
-		    curRef[i][1] = localout[i][1];
-		    // cout << "PSFFOU " << i << " " << curRef[i][0] << " " <<  curRef[i][1] << " " << param_device.NumberFFTPixels1D << " " << FFTMapSize <<"\n";
-		  }
+        // Initializing everything to zero just to be sure
+        for (int i = 0;
+             i < param_device.NumberPixels * param_device.NumberFFTPixels1D;
+             i++)
+        {
+          curRef[i][0] = 0.f;
+          curRef[i][1] = 0.f;
+        }
 
-	      }else{
+        for (int i = 0; i < param_device.NumberPixels; i++)
+        {
+          for (int j = 0; j < param_device.NumberPixels; j++)
+          {
+            localCTF[i * param_device.NumberPixels + j] = 0.f;
+          }
+        }
 
-		//*******CTF*************
-		normctf = 0.0;
+        if (usepsf)
+        {
+          normctf = 0.0;
+
+          for (int i = 0; i < param_device.NumberPixels; i++)
+          {
+            for (int j = 0; j < param_device.NumberPixels; j++)
+            {
+              int ri = 0, rj = 0;
+
+              // Calculating the distance from the periodic center at 0,0
+
+              if (i < nctfmax + 1)
+              {
+                ri = i;
+              }
+              else
+              {
+                ri = param_device.NumberPixels - i;
+              };
+              if (j < nctfmax + 1)
+              {
+                rj = j;
+              }
+              else
+              {
+                rj = param_device.NumberPixels - j;
+              };
+              radsq = (myfloat_t)((ri) * (ri) + (rj) * (rj)) * pixelSize *
+                      pixelSize;
+
+              ctf = exp(-radsq * env / 2.0) *
+                    (-amp * cos(radsq * phase / 2.0) -
+                     sqrt((1 - amp * amp)) * sin(radsq * phase / 2.0));
+
+              localCTF[i * param_device.NumberPixels + j] = (myfloat_t) ctf;
+
+              normctf += localCTF[i * param_device.NumberPixels + j];
+
+              //	    cout << "TT " << i << " " << j << " " << localCTF[i
+              //* param_device.NumberPixels + j]  << "\n";
+            }
+          }
+
+          // Normalization
+          for (int i = 0; i < param_device.NumberPixels; i++)
+          {
+            for (int j = 0; j < param_device.NumberPixels; j++)
+            {
+              localCTF[i * param_device.NumberPixels + j] =
+                  localCTF[i * param_device.NumberPixels + j] / normctf;
+            }
+          }
+
+          // Calling FFT_Forward
+          myfftw_execute_dft_r2c(fft_plan_r2c_forward, localCTF, localout);
+
+          // Saving the Reference PSFs
+
+          for (int i = 0;
+               i < param_device.NumberPixels * param_device.NumberFFTPixels1D;
+               i++)
+          {
+            curRef[i][0] = localout[i][0];
+            curRef[i][1] = localout[i][1];
+            // cout << "PSFFOU " << i << " " << curRef[i][0] << " " <<
+            // curRef[i][1] << " " << param_device.NumberFFTPixels1D << " " <<
+            // FFTMapSize <<"\n";
+          }
+        }
+        else
+        {
 
-		if(amp <0.0000000001){
-		  cout << "Problem with CTF normalization AMP less than threshold < 10^-10 \n";
-		  exit(1);  
-		} 
+          //*******CTF*************
+          normctf = 0.0;
+
+          if (amp < 0.0000000001)
+          {
+            cout << "Problem with CTF normalization AMP less than threshold < "
+                    "10^-10 \n";
+            exit(1);
+          }
+
+          // Directly calculating CTF IN FOURIER SPACE
+          for (int i = 0; i < param_device.NumberFFTPixels1D; i++)
+          {
+            for (int j = 0; j < param_device.NumberFFTPixels1D; j++)
+            {
+              radsq = (myfloat_t)(i * i + j * j) / param_device.NumberPixels /
+                      param_device.NumberPixels / pixelSize / pixelSize;
+              ctf = exp(-env * radsq / 2.) *
+                    (-amp * cos(phase * radsq / 2.) -
+                     sqrt((1 - amp * amp)) * sin(phase * radsq / 2.));
+              if (i == 0 && j == 0)
+                normctf =
+                    (myfloat_t) ctf; // component 0 0 should be the norm in 1d
+              curRef[i * param_device.NumberFFTPixels1D + j][0] = ctf / normctf;
+              curRef[i * param_device.NumberFFTPixels1D + j][1] = 0;
+              // On symmetric side
+              curRef[(param_device.NumberPixels - i - 1) *
+                         param_device.NumberFFTPixels1D +
+                     j][0] = ctf / normctf;
+              curRef[(param_device.NumberPixels - i - 1) *
+                         param_device.NumberFFTPixels1D +
+                     j][1] = 0;
+            }
+          }
+
+          //		 for(int i = 0; i < param_device.NumberPixels *
+          // param_device.NumberFFTPixels1D; i++ )curRef[i][0]/= normctf;
+        }
 
-		//Directly calculating CTF IN FOURIER SPACE
-		for(int i = 0; i < param_device.NumberFFTPixels1D ; i++ )
-		  {
-		    for(int j = 0; j < param_device.NumberFFTPixels1D; j++ )
-		      {
-			radsq = (myfloat_t) (i * i + j * j) / param_device.NumberPixels / param_device.NumberPixels / pixelSize / pixelSize ;
-			ctf = exp(- env * radsq / 2.) * ( -amp * cos( phase * radsq / 2.) - sqrt((1 - amp * amp)) * sin( phase * radsq / 2.));	
-			if( i==0 && j==0 ) normctf = (myfloat_t) ctf; // component 0 0 should be the norm in 1d
-			curRef[i * param_device.NumberFFTPixels1D + j ][0] = ctf / normctf;
-			curRef[i * param_device.NumberFFTPixels1D + j ][1] = 0; 
-			//On symmetric side 
-			curRef[ (param_device.NumberPixels - i - 1) * param_device.NumberFFTPixels1D + j ][0] = ctf / normctf;
-			curRef[ (param_device.NumberPixels - i - 1) * param_device.NumberFFTPixels1D + j ][1] = 0;
-		      }
-		  }
-
-
-		//		 for(int i = 0; i < param_device.NumberPixels * param_device.NumberFFTPixels1D; i++ )curRef[i][0]/= normctf;
-	      }
-	
-
-	      CtfParam[n].pos[0] = amp;
-	      CtfParam[n].pos[1] = phase;
-	      CtfParam[n].pos[2] = env;
-	      n++;
-	      //exit(1);
-	    }
-	}
+        CtfParam[n].pos[0] = amp;
+        CtfParam[n].pos[1] = phase;
+        CtfParam[n].pos[2] = env;
+        n++;
+        // exit(1);
+      }
     }
-
+  }
 
   myfftw_free(localCTF);
   myfftw_free(localout);
   if (nTotCTFs != n)
-    {
-      cout << "Internal error during CTF preparation\n";
-      exit(1);
-    }
-
+  {
+    cout << "Internal error during CTF preparation\n";
+    exit(1);
+  }
 
   // ********** Calculating normalized volumen element *********
 
-  if(!printModel){
-  // All priors (uniform or not) normalized to 1
-  // The volume is the grid-spacing of the parameter / normalization
-  // the order is angles, displacement, ctf amplitud (all uniform) then env b & phase (non uniform) the sqrt(2) cancel out (see SI)
-    param_device.volu = voluang * 
-		 (myfloat_t) param_device.GridSpaceCenter * pixelSize * (myfloat_t) param_device.GridSpaceCenter * pixelSize / ((2.f * (myfloat_t) param_device.maxDisplaceCenter+1.)) / (2.f * (myfloat_t) (param_device.maxDisplaceCenter + 1.)) 
-                / (myfloat_t) numberGridPointsCTF_amp 
-		* gridEnvelop * gridCTF_phase / M_PI  / param_device.sigmaPriorbctf /  param_device.sigmaPriordefo ; 
+  if (!printModel)
+  {
+    // All priors (uniform or not) normalized to 1
+    // The volume is the grid-spacing of the parameter / normalization
+    // the order is angles, displacement, ctf amplitude (all uniform), then
+    // env b & phase (non-uniform); the sqrt(2) cancels out (see SI)
+    param_device.volu =
+        voluang * (myfloat_t) param_device.GridSpaceCenter * pixelSize *
+        (myfloat_t) param_device.GridSpaceCenter * pixelSize /
+        ((2.f * (myfloat_t) param_device.maxDisplaceCenter + 1.)) /
+        (2.f * (myfloat_t)(param_device.maxDisplaceCenter + 1.)) /
+        (myfloat_t) numberGridPointsCTF_amp * gridEnvelop * gridCTF_phase /
+        M_PI / param_device.sigmaPriorbctf / param_device.sigmaPriordefo;
 
     //  cout << "VOLU " << param_device.volu  << " " << gridCTF_amp << "\n";
     // *** Number of total pixels***
 
-    param_device.Ntotpi = (myfloat_t) (param_device.NumberPixels * param_device.NumberPixels);
-    param_device.NtotDist = (2 * (int) (param_device.maxDisplaceCenter / param_device.GridSpaceCenter) + 1 ) * (2 * (int) (param_device.maxDisplaceCenter / param_device.GridSpaceCenter) + 1);
-
+    param_device.Ntotpi =
+        (myfloat_t)(param_device.NumberPixels * param_device.NumberPixels);
+    param_device.NxDisp = 2 * (int) (param_device.maxDisplaceCenter /
+                                     param_device.GridSpaceCenter) +
+                          1;
+    param_device.NtotDisp = param_device.NxDisp * param_device.NxDisp;
   }
-  nTotCC = (int) ((myfloat_t) param_device.NumberPixels / (myfloat_t) param_device.CCdisplace + 1) * (int) ((myfloat_t) param_device.NumberPixels / (myfloat_t) param_device.CCdisplace + 1);  
-  return(0);
+  return (0);
 }
 
 bioem_param::~bioem_param()
@@ -1345,11 +1735,15 @@ bioem_param::~bioem_param()
   numberGridPointsCTF_phase = 0;
   param_device.maxDisplaceCenter = 0;
   numberGridPointsDisplaceCenter = 0;
-  if (refCTF) delete[] refCTF;
-  if (CtfParam) delete[] CtfParam;
-  if (angles) free(angles);
-  if(angprior) delete[] angprior;
-  refCTF= NULL;
+  if (refCTF)
+    delete[] refCTF;
+  if (CtfParam)
+    delete[] CtfParam;
+  if (angles)
+    free(angles);
+  if (angprior)
+    delete[] angprior;
+  refCTF = NULL;
   CtfParam = NULL;
   angles = NULL;
   angprior = NULL;
diff --git a/timer.cpp b/timer.cpp
index c02c0a0763ee38c870dfb996da964b747550f219..d58ff05340b774e67deee765d6a03bf6de315117 100644
--- a/timer.cpp
+++ b/timer.cpp
@@ -1,15 +1,29 @@
+/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+   < BioEM software for Bayesian inference of Electron Microscopy images>
+   Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
+        Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
+   Max Planck Institute of Biophysics, Frankfurt, Germany.
+   Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
+   Germany.
+   Max Planck Computing and Data Facility, Garching, Germany.
+
+   Released under the GNU Public License, v3.
+   See license statement for terms of distribution.
+
+   ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+
 #include "timer.h"
 #ifdef _WIN32
-#include <windows.h>
 #include <winbase.h>
+#include <windows.h>
 #else
 #include <time.h>
 #endif
 
 HighResTimer::HighResTimer()
 {
-	ElapsedTime = 0;
-	running = 0;
+  ElapsedTime = 0;
+  running = 0;
 }
 
 HighResTimer::~HighResTimer() {}
@@ -17,76 +31,75 @@ HighResTimer::~HighResTimer() {}
 void HighResTimer::Start()
 {
 #ifdef _WIN32
-	__int64 istart;
-	QueryPerformanceCounter((LARGE_INTEGER*)&istart);
-	StartTime = (double) istart;
+  __int64 istart;
+  QueryPerformanceCounter((LARGE_INTEGER *) &istart);
+  StartTime = (double) istart;
 #else
-	timespec tv;
-	clock_gettime(CLOCK_REALTIME, &tv);
-	StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
+  timespec tv;
+  clock_gettime(CLOCK_REALTIME, &tv);
+  StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
 #endif
-	running = 1;
+  running = 1;
 }
 
 void HighResTimer::ResetStart()
 {
-	ElapsedTime = 0;
-	Start();
+  ElapsedTime = 0;
+  Start();
 }
 
 void HighResTimer::Stop()
 {
-	if (running == 0) return;
-	running = 0;
-	double EndTime = 0;
+  if (running == 0)
+    return;
+  running = 0;
+  double EndTime = 0;
 #ifdef _WIN32
-	__int64 iend;
-	QueryPerformanceCounter((LARGE_INTEGER*) &iend);
-	EndTime = (double) iend;
+  __int64 iend;
+  QueryPerformanceCounter((LARGE_INTEGER *) &iend);
+  EndTime = (double) iend;
 #else
-	timespec tv;
-	clock_gettime(CLOCK_REALTIME, &tv);
-	EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
+  timespec tv;
+  clock_gettime(CLOCK_REALTIME, &tv);
+  EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
 #endif
-	ElapsedTime += EndTime - StartTime;
+  ElapsedTime += EndTime - StartTime;
 }
 
 void HighResTimer::Reset()
 {
-	ElapsedTime = 0;
-	StartTime = 0;
-	running = 0;
+  ElapsedTime = 0;
+  StartTime = 0;
+  running = 0;
 }
 
-double HighResTimer::GetElapsedTime()
-{
-	return ElapsedTime / Frequency;
-}
+double HighResTimer::GetElapsedTime() { return ElapsedTime / Frequency; }
 
 double HighResTimer::GetCurrentElapsedTime()
 {
-	if (running == 0) return(GetElapsedTime());
-	double CurrentTime = 0;
+  if (running == 0)
+    return (GetElapsedTime());
+  double CurrentTime = 0;
 #ifdef _WIN32
-	__int64 iend;
-	QueryPerformanceCounter((LARGE_INTEGER*) &iend);
-	CurrentTime = (double) iend;
+  __int64 iend;
+  QueryPerformanceCounter((LARGE_INTEGER *) &iend);
+  CurrentTime = (double) iend;
 #else
-	timespec tv;
-	clock_gettime(CLOCK_REALTIME, &tv);
-	CurrentTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
+  timespec tv;
+  clock_gettime(CLOCK_REALTIME, &tv);
+  CurrentTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
 #endif
-	return((CurrentTime - StartTime + ElapsedTime) / Frequency);
+  return ((CurrentTime - StartTime + ElapsedTime) / Frequency);
 }
 
 double HighResTimer::GetFrequency()
 {
 #ifdef _WIN32
-	__int64 ifreq;
-	QueryPerformanceFrequency((LARGE_INTEGER*)&ifreq);
-	return((double) ifreq);
+  __int64 ifreq;
+  QueryPerformanceFrequency((LARGE_INTEGER *) &ifreq);
+  return ((double) ifreq);
 #else
-	return(1.0E9);
+  return (1.0E9);
 #endif
 }
 
@@ -114,9 +127,10 @@ void TimeStat::InitTimeStat(int nlogs)
 
 void TimeStat::EmptyTimeStat()
 {
-  if (tl == NULL) return;
+  if (tl == NULL)
+    return;
 
-  delete [ ] tl;
+  delete[] tl;
   tl = NULL;
   time = 0.;
 }
@@ -127,22 +141,25 @@ void TimeStat::ComputeTimeStat()
   vector<double> diff;
 
   for (int i = 0; i < total_logs; i++)
-    {
-      tl[i].sum = std::accumulate(tl[i].vec.begin(), tl[i].vec.end(), 0.0);
-      mean = tl[i].sum / tl[i].vec.size();
-
-      diff.resize(tl[i].vec.size());
-      std::transform(tl[i].vec.begin(), tl[i].vec.end(), diff.begin(), std::bind2nd(std::minus<double>(), mean));
-      sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-      tl[i].stdev = std::sqrt(sq_sum / tl[i].vec.size());
-    }
+  {
+    tl[i].sum = std::accumulate(tl[i].vec.begin(), tl[i].vec.end(), 0.0);
+    mean = tl[i].sum / tl[i].vec.size();
+
+    diff.resize(tl[i].vec.size());
+    std::transform(tl[i].vec.begin(), tl[i].vec.end(), diff.begin(),
+                   std::bind2nd(std::minus<double>(), mean));
+    sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+    tl[i].stdev = std::sqrt(sq_sum / tl[i].vec.size());
+  }
 }
 
 void TimeStat::PrintTimeStat(int mpi_rank)
 {
   ComputeTimeStat();
   for (int i = 0; i < total_logs; i++)
-    {
-      printf("SUMMARY -> %s: Total %f sec; Mean %f sec; Std.Dev. %f (rank %d)\n", tl[i].name.c_str(), tl[i].sum,  tl[i].sum / tl[i].vec.size(), tl[i].stdev, mpi_rank);
-    }
+  {
+    printf("SUMMARY -> %s: Total %f sec; Mean %f sec; Std.Dev. %f (rank %d)\n",
+           tl[i].name.c_str(), tl[i].sum, tl[i].sum / tl[i].vec.size(),
+           tl[i].stdev, mpi_rank);
+  }
 }