Commit ad9a1e47 authored by Luka Stanisic's avatar Luka Stanisic

prototype of a GPUWORKLOAD autotuning

parent 1459580e
...@@ -97,6 +97,7 @@ bioem::bioem() ...@@ -97,6 +97,7 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO")); FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT")); DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE")); nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = getenv("BIOEM_AUTOTUNING") == NULL ? 0 : atoi(getenv("BIOEM_AUTOTUNING"));
} }
bioem::~bioem() bioem::~bioem()
...@@ -520,6 +521,12 @@ int bioem::run() ...@@ -520,6 +521,12 @@ int bioem::run()
HighResTimer timer, timer2; HighResTimer timer, timer2;
/* These variables are used for autotuning */
double best_time = 0;
int workload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
int best_workload = workload;
bool stopTuning=false;
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size); if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
...@@ -559,6 +566,7 @@ int bioem::run() ...@@ -559,6 +566,7 @@ int bioem::run()
{ {
// *** Calculating convolutions of projection map and crosscorrelations *** // *** Calculating convolutions of projection map and crosscorrelations ***
if (Autotuning && !stopTuning) timer.ResetStart();
if (DebugOutput >= 2) timer.ResetStart(); if (DebugOutput >= 2) timer.ResetStart();
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV); createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank); if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
...@@ -576,9 +584,10 @@ int bioem::run() ...@@ -576,9 +584,10 @@ int bioem::run()
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV); compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
if (DebugOutput >= 2) if (DebugOutput >= 2)
{ {
const double compTime = timer.GetCurrentElapsedTime(); compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts * const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime; (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
...@@ -588,12 +597,33 @@ int bioem::run() ...@@ -588,12 +597,33 @@ int bioem::run()
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank); printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
} }
if (Autotuning && !stopTuning && (iConv % 5 == 4))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
if (best_time==0 || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
if (workload < 30)
{
stopTuning=true;
workload=best_workload;
}
deviceFinishRun();
rebalance(workload);
deviceStartRun();
}
} }
if (DebugOutput >= 1) if (DebugOutput >= 1)
{ {
printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank); printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
timer2.ResetStart(); timer2.ResetStart();
} }
} }
} }
//deallocating fftw_complex vector //deallocating fftw_complex vector
...@@ -1395,7 +1425,7 @@ int bioem::deviceInit() ...@@ -1395,7 +1425,7 @@ int bioem::deviceInit()
} }
int bioem::deviceStartRun() int bioem::deviceStartRun()
{ { deviceInit();
return(0); return(0);
} }
...@@ -1413,3 +1443,5 @@ void bioem::free_device_host(void* ptr) ...@@ -1413,3 +1443,5 @@ void bioem::free_device_host(void* ptr)
{ {
free(ptr); free(ptr);
} }
// Base-class rebalance hook: intentionally does nothing with the requested
// workload. The method is declared virtual in the class headers, so derived
// classes may supply their own implementation.
void bioem::rebalance(int workload)
{
	(void) workload; // unused in this implementation
}
...@@ -572,6 +572,19 @@ void bioem_cuda::free_device_host(void* ptr) ...@@ -572,6 +572,19 @@ void bioem_cuda::free_device_host(void* ptr)
cudaFreeHost(ptr); cudaFreeHost(ptr);
} }
// Applies a new GPU workload percentage and recomputes maxRef from it.
//
// workload: requested percentage. The request is silently ignored when it is
//           outside 0..100 or when it is larger than the current GPUWorkload,
//           i.e. this function can only keep or lower the workload.
//
// On acceptance GPUWorkload is overwritten and maxRef becomes
// RefMap.ntotRefMap * GPUWorkload / 100 (integer division, computed in size_t).
//
// NOTE(review): the autotuning loop in bioem::run() ends by calling
// rebalance(best_workload) after stepping the workload downwards; if
// best_workload is above the last value applied here, that call appears to be
// rejected by the guard below - confirm this is intended.
void bioem_cuda::rebalance(int workload)
{
	const bool inPercentRange = (workload >= 0) && (workload <= 100);
	if (!inPercentRange || workload > GPUWorkload)
	{
		return;
	}

	if (DebugOutput >= 1) printf("\t\tSetting GPU workload to %d%%\n", workload);

	GPUWorkload = workload;
	// Widen both factors to size_t before multiplying, as the original does.
	maxRef = static_cast<size_t>(RefMap.ntotRefMap) * static_cast<size_t>(GPUWorkload) / 100;
}
bioem* bioem_cuda_create() bioem* bioem_cuda_create()
{ {
int count; int count;
......
...@@ -42,6 +42,7 @@ public: ...@@ -42,6 +42,7 @@ public:
virtual void* malloc_device_host(size_t size); virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr); virtual void free_device_host(void* ptr);
virtual void rebalance(int workload);
int createProjection(int iMap, mycomplex_t* map); int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare); int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
...@@ -71,6 +72,7 @@ protected: ...@@ -71,6 +72,7 @@ protected:
int FFTAlgo; //Use the FFT Algorithm (Default 1) int FFTAlgo; //Use the FFT Algorithm (Default 1)
int DebugOutput; //Debug Output Level (Default 2) int DebugOutput; //Debug Output Level (Default 2)
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1) int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
int Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
}; };
#endif #endif
...@@ -33,6 +33,7 @@ public: ...@@ -33,6 +33,7 @@ public:
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0); virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size); virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr); virtual void free_device_host(void* ptr);
virtual void rebalance(int workload);
protected: protected:
virtual int deviceInit(); virtual int deviceInit();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment