Commit 4b034659 authored by Luka Stanisic

rel2: code development

parent 254d53db
...@@ -29,7 +29,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") ...@@ -29,7 +29,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
include_directories(include) include_directories(include)
set (BIOEM_ICC_FLAGS "-xHost -O3 -fno-alias -fno-fnalias -unroll -g0 -ipo") set (BIOEM_ICC_FLAGS "-O3 -fno-alias -fno-fnalias -unroll -g0 -ip")
set (BIOEM_GCC_FLAGS "-O3 -march=native -fweb -mfpmath=sse -frename-registers -minline-all-stringops -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -ggdb") set (BIOEM_GCC_FLAGS "-O3 -march=native -fweb -mfpmath=sse -frename-registers -minline-all-stringops -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -ggdb")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
...@@ -50,11 +50,6 @@ if (NOT FFTW_FOUND) ...@@ -50,11 +50,6 @@ if (NOT FFTW_FOUND)
endif() endif()
include_directories(${FFTW_INCLUDE_DIRS}) include_directories(${FFTW_INCLUDE_DIRS})
find_package( Boost 1.43 REQUIRED COMPONENTS program_options )
include_directories( ${Boost_INCLUDE_DIRS} )
###Find Optional Packages ###Find Optional Packages
###Find CUDA ###Find CUDA
...@@ -163,7 +158,6 @@ if (FFTWF_LIBRARIES) ...@@ -163,7 +158,6 @@ if (FFTWF_LIBRARIES)
else() else()
target_link_libraries(bioEM -L${FFTW_LIBDIR} -lfftw3 -lfftw3f) target_link_libraries(bioEM -L${FFTW_LIBDIR} -lfftw3 -lfftw3f)
endif() endif()
target_link_libraries(bioEM ${Boost_PROGRAM_OPTIONS_LIBRARY})
if (MPI_FOUND) if (MPI_FOUND)
target_link_libraries(bioEM ${MPI_LIBRARIES}) target_link_libraries(bioEM ${MPI_LIBRARIES})
...@@ -172,7 +166,6 @@ endif() ...@@ -172,7 +166,6 @@ endif()
###Show Status ###Show Status
message(STATUS "Build Status") message(STATUS "Build Status")
message(STATUS "FFTW library: ${FFTW_LIBDIR}") message(STATUS "FFTW library: ${FFTW_LIBDIR}")
message(STATUS "Boost directory: ${Boost_LIBRARY_DIRS}")
message(STATUS "FFTW includedir: ${FFTW_INCLUDEDIR}") message(STATUS "FFTW includedir: ${FFTW_INCLUDEDIR}")
message(STATUS "CUDA libraries: ${CUDA_CUDA_LIBRARY}") message(STATUS "CUDA libraries: ${CUDA_CUDA_LIBRARY}")
message(STATUS "CUDART libraries: ${CUDA_LIBRARIES}") message(STATUS "CUDART libraries: ${CUDA_LIBRARIES}")
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2017 Pilar Cossio, Markus Rampp, Luka Stanisic and Gerhard
Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#include "autotuner.h" #include "autotuner.h"
void Autotuner::Reset() void Autotuner::Reset()
...@@ -16,19 +28,22 @@ void Autotuner::Reset() ...@@ -16,19 +28,22 @@ void Autotuner::Reset()
fb = 0.; fb = 0.;
fx = 0.; fx = 0.;
if (algo == 3) workload = 50; if (algo == 3)
workload = 50;
} }
bool Autotuner::Needed(int iteration) bool Autotuner::Needed(int iteration)
{ {
if (stopTuning) return false; if (stopTuning)
return false;
switch (algo) switch (algo)
{ {
case 1: case 1:
case 3: case 3:
return iteration % (stable + 1) == stable; return iteration % (stable + 1) == stable;
case 2: return (iteration == (int) stable / 2 ) || (iteration == stable); case 2:
return (iteration == (int) stable / 2) || (iteration == stable);
default: /* Should never happen */; default: /* Should never happen */;
} }
return false; return false;
...@@ -46,10 +61,12 @@ bool Autotuner::Finished() ...@@ -46,10 +61,12 @@ bool Autotuner::Finished()
} }
break; break;
case 2: case 2:
if (best_workload != 0) return stopTuning = true; if (best_workload != 0)
return stopTuning = true;
break; break;
case 3: case 3:
if ((c - b == limit) && (b - a == limit)) return stopTuning = true; if ((c - b == limit) && (b - a == limit))
return stopTuning = true;
break; break;
default: /* Should never happen */; default: /* Should never happen */;
} }
...@@ -60,9 +77,15 @@ void Autotuner::Tune(double compTime) ...@@ -60,9 +77,15 @@ void Autotuner::Tune(double compTime)
{ {
switch (algo) switch (algo)
{ {
case 1: AlgoSimple(compTime); break; case 1:
case 2: AlgoRatio(compTime); break; AlgoSimple(compTime);
case 3: AlgoBisection(compTime); break; break;
case 2:
AlgoRatio(compTime);
break;
case 3:
AlgoBisection(compTime);
break;
default: /* Should never happen */; default: /* Should never happen */;
} }
} }
...@@ -121,6 +144,6 @@ void Autotuner::AlgoBisection(double compTime) ...@@ -121,6 +144,6 @@ void Autotuner::AlgoBisection(double compTime)
c = x; c = x;
} }
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2); x = (c - b > b - a) ? (int) (b + (c - b) / 2) : (int) (a + (b - a + 1) / 2);
workload = x; workload = x;
} }
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, Copyright (C) 2017 Pilar Cossio, Markus Rampp, Luka Stanisic and Gerhard
Volker Lindenstruth and Gerhard Hummer. Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3.
See license statement for terms of distribution. See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
...@@ -11,13 +13,19 @@ ...@@ -11,13 +13,19 @@
#ifndef AUTOTUNER_H #ifndef AUTOTUNER_H
#define AUTOTUNER_H #define AUTOTUNER_H
class Autotuner { class Autotuner
{
public: public:
Autotuner() {stopTuning = true;} Autotuner() { stopTuning = true; }
/* Setting variables to initial values */ /* Setting variables to initial values */
inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); } inline void Initialize(int alg = 3, int st = 7)
{
algo = alg;
stable = st;
Reset();
}
/* Resetting variables to initial values */ /* Resetting variables to initial values */
void Reset(); void Reset();
...@@ -32,7 +40,7 @@ public: ...@@ -32,7 +40,7 @@ public:
void Tune(double compTime); void Tune(double compTime);
/* Return workload value */ /* Return workload value */
inline int Workload() {return workload;} inline int Workload() { return workload; }
private: private:
int algo; int algo;
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer. Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany. Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
Germany.
Max Planck Computing and Data Facility, Garching, Germany. Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3. Released under the GNU Public License, v3.
...@@ -14,10 +15,10 @@ ...@@ -14,10 +15,10 @@
#ifndef BIOEM_H #ifndef BIOEM_H
#define BIOEM_H #define BIOEM_H
#include "defs.h"
#include "bioem.h" #include "bioem.h"
#include "model.h" #include "defs.h"
#include "map.h" #include "map.h"
#include "model.h"
#include "param.h" #include "param.h"
class bioem class bioem
...@@ -29,31 +30,48 @@ public: ...@@ -29,31 +30,48 @@ public:
bioem(); bioem();
virtual ~bioem(); virtual ~bioem();
int configure(int ac, char* av[]); void printOptions(myoption_t *myoptions, int myoptions_length);
void cleanup(); //Cleanup everything happening during configure int readOptions(int ac, char *av[]);
int configure(int ac, char *av[]);
void cleanup(); // Cleanup everything happening during configure
int precalculate(); // Is it better to pass directly the input File names? int precalculate(); // Is it better to pass directly the input File names?
int dopreCalCrossCorrelation(int iRefMap, int iRefMapLocal); inline int needToPrintModel() { return param.printModel; }
int printModel();
int run(); int run();
int doProjections(int iMap); int doProjections(int iMap);
int createConvolutedProjectionMap(int iOreint, int iMap, mycomplex_t* lproj, myfloat_t* Mapconv, mycomplex_t* localmultFFT, myfloat_t& sumC, myfloat_t& sumsquareC); int createConvolutedProjectionMap(int iOreint, int iMap, mycomplex_t *lproj,
mycomplex_t *localmultFFT, myfloat_t &sumC,
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); myfloat_t &sumsquareC);
int createConvolutedProjectionMap_noFFT(mycomplex_t *lproj,
virtual void* malloc_device_host(size_t size); myfloat_t *Mapconv,
virtual void free_device_host(void* ptr); mycomplex_t *localmultFFT,
virtual void rebalance(int workload); //Rebalance GPUWorkload myfloat_t &sumC,
void rebalanceWrapper(int workload); //Rebalance wrapper myfloat_t &sumsquareC);
int createProjection(int iMap, mycomplex_t* map); virtual int compareRefMaps(int iPipeline, int iOrient, int iConv,
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare); int maxParallelConv, mycomplex_t *localmultFFT,
void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT, mycomplex_t* localCCT, myfloat_t* lCC); myparam5_t *comp_params, const int startMap = 0);
virtual void *malloc_device_host(size_t size);
virtual void free_device_host(void *ptr);
virtual void rebalance(int workload); // Rebalance GPUWorkload
void rebalanceWrapper(int workload); // Rebalance wrapper
int createProjection(int iMap, mycomplex_t *map);
int calcross_cor(myfloat_t *localmap, myfloat_t &sum, myfloat_t &sumsquare);
void calculateCCFFT(int iMap, mycomplex_t *localConvFFT,
mycomplex_t *localCCT, myfloat_t *lCC);
void doRefMap_CPU_Parallel(int iRefMap, int iOrient, int iConv,
myfloat_t *lCC, myparam5_t *comp_params,
myblockCPU_t *comp_block);
void doRefMap_CPU_Reduce(int iRefMap, int iOrient, int iConvStart,
int maxParallelConv, myparam5_t *comp_params,
myblockCPU_t *comp_block);
bioem_Probability pProb; bioem_Probability pProb;
string OutfileName; string OutfileName;
bool yesoutfilename;
protected: protected:
virtual int deviceInit(); virtual int deviceInit();
...@@ -64,16 +82,20 @@ protected: ...@@ -64,16 +82,20 @@ protected:
bioem_model Model; bioem_model Model;
bioem_RefMap RefMap; bioem_RefMap RefMap;
int nReferenceMaps; //Maps in memory at a time int nReferenceMaps; // Maps in memory at a time
int nReferenceMapsTotal; //Maps in total int nReferenceMapsTotal; // Maps in total
int nProjectionMaps; //Maps in memory at a time int nProjectionMaps; // Maps in memory at a time
int nProjectionMapsTotal; //Maps in total int nProjectionMapsTotal; // Maps in total
int FFTAlgo; //Use the FFT Algorithm (Default 1) int BioEMAlgo; // BioEM algorithm used to do comparison (Default 1)
int DebugOutput; //Debug Output Level (Default 2) int CudaThreadCount; // Number of CUDA threads used in each block (Default
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1) // depends on the BioEM algorithm)
bool Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs int DebugOutput; // Debug Output Level (Default 0)
int nProjectionsAtOnce; // Number of projections to do at once via OpenMP
// (Default number of OMP threads)
bool Autotuning; // Do the autotuning of the load-balancing between CPUs and
// GPUs (Default 1, if GPUs are used and GPUWORKLOAD is not specified)
}; };
#endif #endif
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer. Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany. Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
Germany.
Max Planck Computing and Data Facility, Garching, Germany. Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3. Released under the GNU Public License, v3.
...@@ -16,6 +17,6 @@ ...@@ -16,6 +17,6 @@
#include "bioem.h" #include "bioem.h"
extern bioem* bioem_cuda_create(); extern bioem *bioem_cuda_create();
#endif #endif
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer. Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany. Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
Germany.
Max Planck Computing and Data Facility, Garching, Germany. Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3. Released under the GNU Public License, v3.
...@@ -17,7 +18,7 @@ ...@@ -17,7 +18,7 @@
#include <cuda.h> #include <cuda.h>
#include <cufft.h> #include <cufft.h>
//Hack to make nvcc compiler accept fftw.h, float128 is not used anyway // Hack to make nvcc compiler accept fftw.h, float128 is not used anyway
#define __float128 double #define __float128 double
#include <fftw3.h> #include <fftw3.h>
#undef __float128 #undef __float128
...@@ -30,10 +31,12 @@ public: ...@@ -30,10 +31,12 @@ public:
bioem_cuda(); bioem_cuda();
virtual ~bioem_cuda(); virtual ~bioem_cuda();
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); virtual int compareRefMaps(int iPipeline, int iOrient, int iConv,
virtual void* malloc_device_host(size_t size); int maxParallelConv, mycomplex_t *localmultFFT,
virtual void free_device_host(void* ptr); myparam5_t *comp_params, const int startMap = 0);
virtual void rebalance(int workload); //Rebalance GPUWorkload virtual void *malloc_device_host(size_t size);
virtual void free_device_host(void *ptr);
virtual void rebalance(int workload); // Rebalance GPUWorkload
protected: protected:
virtual int deviceInit(); virtual int deviceInit();
...@@ -46,32 +49,39 @@ private: ...@@ -46,32 +49,39 @@ private:
int deviceInitialized; int deviceInitialized;
cudaStream_t cudaStream[3]; cudaStream_t cudaStream[PIPELINE_LVL + 1]; // Streams are used for both
cudaEvent_t cudaEvent[3]; // PIPELINE and MULTISTREAM control
cudaEvent_t cudaFFTEvent[2]; cudaEvent_t cudaEvent[PIPELINE_LVL + 1];
bioem_RefMap_Mod* pRefMap_device_Mod; cudaEvent_t cudaFFTEvent[MULTISTREAM_LVL];
bioem_RefMap* gpumap; bioem_RefMap *gpumap;
bioem_Probability* pProb_host; bioem_Probability *pProb_host;
bioem_Probability pProb_device; bioem_Probability pProb_device;
void* pProb_memory; void *pProb_memory;
myfloat_t* pConvMap_device[2];
mycomplex_t* pRefMapsFFT; mycomplex_t *pRefMapsFFT;
mycomplex_t* pConvMapFFT; mycomplex_t *pConvMapFFT;
mycomplex_t* pConvMapFFT_Host; mycomplex_t *pConvMapFFT_Host;
mycuComplex_t* pFFTtmp2[2]; mycuComplex_t *pFFTtmp2[MULTISTREAM_LVL];
myfloat_t* pFFTtmp[2]; myfloat_t *pFFTtmp[MULTISTREAM_LVL];
cufftHandle plan[2][2]; cufftHandle plan[SPLIT_MAPS_LVL][MULTISTREAM_LVL];
myfloat_t *maps, *sum, *sumsquare; myparam5_t *pTmp_comp_params;
int GPUAlgo; //GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best) myblockGPU_t *pTmp_comp_blocks;
int GPUAsync; //Run GPU Asynchronously, do the convolutions on the host in parallel. int Ncomp_blocks;
int GPUDualStream; //Use two streams to improve paralelism
int GPUWorkload; //Percentage of workload to perform on GPU. Default 100. Rest is done on processor in parallel. bool *initialized_const; // In order to make sure Constoadd is initialized to
// the first value
myfloat_t *sum, *sumsquare;
int GPUAsync; // Run GPU Asynchronously, do the convolutions on the host in
// parallel.
int GPUDualStream; // Use two streams to improve paralelism
int GPUWorkload; // Percentage of workload to perform on GPU. Default 100.
// Rest is done on processor in parallel.
int maxRef; int maxRef;
}; };
#endif #endif
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images> < BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp, Copyright (C) 2017 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer. Luka Stanisic, Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany. Max Planck Institute of Biophysics, Frankfurt, Germany.
Frankfurt Institute for Advanced Studies, Goethe University Frankfurt, Germany. Frankfurt Institute for Advanced Studies, Goethe University Frankfurt,
Germany.
Max Planck Computing and Data Facility, Garching, Germany. Max Planck Computing and Data Facility, Garching, Germany.
Released under the GNU Public License, v3. Released under the GNU Public License, v3.
...@@ -14,11 +15,22 @@ ...@@ -14,11 +15,22 @@
#ifndef BIOEM_DEFS_H #ifndef BIOEM_DEFS_H
#define BIOEM_DEFS_H #define BIOEM_DEFS_H
#define BIOEM_PROB_DOUBLE
//#define BIOEM_USE_DOUBLE //#define BIOEM_USE_DOUBLE
//#define DEBUG
//#define DEBUG_GPU
//#define DEBUG_PROB
//#define PILAR_DEBUG #ifndef BIOEM_PROB_DOUBLE
typedef float myprob_t;
#define MY_MPI_FLOAT MPI_FLOAT
#else
typedef double myprob_t;
#define MY_MPI_FLOAT MPI_DOUBLE
#endif
#ifndef BIOEM_USE_DOUBLE #ifndef BIOEM_USE_DOUBLE
#define MIN_PROB -999999.
typedef float myfloat_t; typedef float myfloat_t;
#define myfftw_malloc fftwf_malloc #define myfftw_malloc fftwf_malloc
#define myfftw_free fftwf_free #define myfftw_free fftwf_free
...@@ -35,9 +47,9 @@ typedef float myfloat_t; ...@@ -35,9 +47,9 @@ typedef float myfloat_t;
#define MY_CUFFT_C2R CUFFT_C2R #define MY_CUFFT_C2R CUFFT_C2R
#define mycufftExecC2R cufftExecC2R #define mycufftExecC2R cufftExecC2R
#define mycuComplex_t cuComplex #define mycuComplex_t cuComplex
#define MY_MPI_FLOAT MPI_FLOAT
#else #else
typedef double myfloat_t; typedef double myfloat_t;
#define MIN_PROB -999999.
#define myfftw_malloc fftw_malloc #define myfftw_malloc fftw_malloc
#define myfftw_free fftw_free #define myfftw_free fftw_free
#define myfftw_destroy_plan fftw_destroy_plan #define myfftw_destroy_plan fftw_destroy_plan
...@@ -53,11 +65,10 @@ typedef double myfloat_t; ...@@ -53,11 +65,10 @@ typedef double myfloat_t;
#define mycufftExecC2R cufftExecZ2D #define mycufftExecC2R cufftExecZ2D
#define mycuComplex_t cuDoubleComplex #define mycuComplex_t cuDoubleComplex
#define MY_CUFFT_C2R CUFFT_Z2D #define MY_CUFFT_C2R CUFFT_Z2D
#define MY_MPI_FLOAT MPI_DOUBLE
#endif #endif
typedef myfloat_t mycomplex_t[2]; typedef myfloat_t mycomplex_t[2];
#define BIOEM_FLOAT_3_PHYSICAL_SIZE 3 //Possible set to 4 for GPU #define BIOEM_FLOAT_3_PHYSICAL_SIZE 3 // Possible set to 4 for GPU
struct myfloat3_t struct myfloat3_t
{ {
...@@ -66,6 +77,52 @@ struct myfloat3_t ...@@ -66,6 +77,52 @@ struct myfloat3_t
// myfloat_t prior; // myfloat_t prior;
}; };
/* myoption_t
Describes a single program option. The structure exists in order to mimic the
behaviour of the old Boost program_options based option handling.
An array of these entries, together with its length, is what
bioem::printOptions() receives.
*/
struct myoption_t
{
// NOTE(review): presumably the long name of the option -- confirm in
// bioem::readOptions()
const char *name;
// NOTE(review): looks like a getopt-style "option takes an argument" flag --
// confirm against the code that fills the option table
int arg;
// NOTE(review): presumably the help text printed for this option -- confirm
const char *desc;
// NOTE(review): presumably marks options left out of the printed help --
// confirm in bioem::printOptions()
bool hidden;
};
/* comp_params
Puts all parameters needed for each comparison in a single structure.
This makes the code cleaner and requires fewer GPU transfers.
compareRefMaps() and the doRefMap_CPU_* routines receive these values through
a myparam5_t pointer named comp_params.
*/
struct myparam5_t
{
// NOTE(review): amp / pha / env are presumably the amplitude, phase and
// envelope parameters of the convolution being compared -- confirm against
// the callers that fill comp_params
myfloat_t amp;
myfloat_t pha;
myfloat_t env;
// Named like the sumC / sumsquareC reference outputs of
// createConvolutedProjectionMap(); presumably copied from them -- confirm
myfloat_t sumC;
myfloat_t sumsquareC;
};
/* comp_block
Groups all values created by each inside-block comparison in a single
structure, which makes the code cleaner. Two variants exist: myblockGPU_t
(below) for GPUs and myblockCPU_t for CPUs.
*/
// For GPUs
struct myblockGPU_t
{
// NOTE(review): presumably the best log-probability found within the block
// -- confirm against the GPU comparison kernel
myprob_t logpro;
// NOTE(review): presumably an index identifying where logpro was found --
// confirm
int id;
// NOTE(review): presumably an accumulated sum of exponentials used when
// combining probabilities -- confirm
myprob_t sumExp;
// NOTE(review): presumably the per-angle accumulated probability -- confirm
myprob_t sumAngles;
};
// For CPUs (easier to save value as well)
// Shares its first three fields with myblockGPU_t but stores `value` (a
// myfloat_t) in place of sumAngles. Passed as comp_block to
// bioem::doRefMap_CPU_Parallel() and bioem::doRefMap_CPU_Reduce().
struct myblockCPU_t
{
// NOTE(review): presumably the best log-probability found within the block
// -- confirm against doRefMap_CPU_Parallel()
myprob_t logpro;
// NOTE(review): presumably an index identifying where logpro was found --
// confirm
int id;
// NOTE(review): presumably an accumulated sum of exponentials used when
// combining probabilities -- confirm
myprob_t sumExp;
// NOTE(review): presumably the raw comparison value that produced logpro --
// confirm
myfloat_t value;
};
#ifdef BIOEM_GPUCODE #ifdef BIOEM_GPUCODE
#define myThreadIdxX threadIdx.x #define myThreadIdxX threadIdx.x
#define myThreadIdxY threadIdx.y #define myThreadIdxY threadIdx.y
...@@ -85,44 +142,53 @@ struct myfloat3_t ...@@ -85,44 +142,53 @@ struct myfloat3_t
#define myBlockIdxY 0 #define myBlockIdxY 0
#endif #endif
#define CUDA_THREAD_COUNT 256 #define OUTPUT_PRECISION 4
#define CUDA_BLOCK_COUNT 1024 * 16
#define CUDA_MAX_SHIFT_REDUCE 1024 #define CUDA_THREAD_COUNT_ALGO1 256
#define CUDA_THREAD_COUNT_ALGO2 512
#define CUDA_THREAD_MAX 1024
#define CUDA_FFTS_AT_ONCE 1024 #define CUDA_FFTS_AT_ONCE 1024
//#define BIOEM_USE_NVTX
#define PIPELINE_LVL 2
#define MULTISTREAM_LVL 2
#define SPLIT_MAPS_LVL 2
/* Autotuning /* Autotuning
Autotuning algorithms: Autotuning algorithms:
1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples of 5. Taking the value with the best timing. 1. AlgoSimple = 1; Testing workload values between 100 and 30, all multiples
2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the workload are timed, and then the optimal workload balance is computed. of 5. Taking the value with the best timing.
3. AlgoBisection = 3; Based on bisection, multiple workload values are tested until the optimal one is found. 2. AlgoRatio = 2; Comparisons where GPU handles 100% or only 1% of the
workload are timed, and then the optimal workload balance is computed.
3. AlgoBisection = 3; Based on bisection, multiple workload values are
tested until the optimal one is found.
*/ */
#define AUTOTUNING_ALGORITHM 3 #define AUTOTUNING_ALGORITHM 3
/* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de facto disable recalibration */ /* Recalibrate every X projections. Put to a very high value, i.e., 99999, to de
* facto disable recalibration */
#define RECALIB_FACTOR 200 #define RECALIB_FACTOR 200