Commit ad9a1e47 authored by Luka Stanisic's avatar Luka Stanisic

prototype of a GPUWORKLOAD autotuning

parent 1459580e
......@@ -97,6 +97,7 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = getenv("BIOEM_AUTOTUNING") == NULL ? 0 : atoi(getenv("BIOEM_AUTOTUNING"));
}
bioem::~bioem()
......@@ -520,6 +521,12 @@ int bioem::run()
HighResTimer timer, timer2;
/* These variables are used for Autotuning */
double best_time = 0;
int workload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
int best_workload = workload;
bool stopTuning=false;
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -559,6 +566,7 @@ int bioem::run()
{
// *** Calculating convolutions of projection map and crosscorrelations ***
if (Autotuning && !stopTuning) timer.ResetStart();
if (DebugOutput >= 2) timer.ResetStart();
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
......@@ -576,9 +584,10 @@ int bioem::run()
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
if (DebugOutput >= 2)
{
const double compTime = timer.GetCurrentElapsedTime();
compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
......@@ -588,12 +597,33 @@ int bioem::run()
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning && !stopTuning && (iConv % 5 == 4))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
if (best_time==0 || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
if (workload < 30)
{
stopTuning=true;
workload=best_workload;
}
deviceFinishRun();
rebalance(workload);
deviceStartRun();
}
}
if (DebugOutput >= 1)
{
printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
timer2.ResetStart();
}
timer2.ResetStart();
}
}
}
//deallocating fftw_complex vector
......@@ -1395,7 +1425,7 @@ int bioem::deviceInit()
}
/**
 * Prepare the device for a run by (re)initializing it.
 *
 * The autotuning code in bioem::run() calls deviceFinishRun(), rebalance()
 * and then this function, so initialization is repeated on every restart.
 *
 * @return the status reported by deviceInit(). Previously the status was
 *         dropped and 0 was returned unconditionally, which hid any
 *         initialization failure from the caller.
 */
int bioem::deviceStartRun()
{
	return deviceInit();
}
......@@ -1413,3 +1443,5 @@ void bioem::free_device_host(void* ptr)
{
free(ptr);
}
/**
 * Base-class workload rebalancing hook: intentionally does nothing.
 *
 * @param workload requested workload value; ignored by this implementation.
 *                 bioem_cuda::rebalance() provides a version that acts on it.
 */
void bioem::rebalance(int workload)
{
	// Nothing to rebalance in the base implementation.
}
......@@ -572,6 +572,19 @@ void bioem_cuda::free_device_host(void* ptr)
cudaFreeHost(ptr);
}
/**
 * Change the percentage of reference maps assigned to the GPU.
 *
 * The request is applied only when it lies in [0, 100] and does not exceed
 * the current GPUWorkload; any other value leaves the state untouched.
 *
 * NOTE(review): because increases are rejected, a caller that lowers the
 * workload step by step and then asks to return to an earlier, larger value
 * is silently ignored here — confirm this is the intended behaviour.
 *
 * @param workload requested GPU share in percent.
 */
void bioem_cuda::rebalance(int workload)
{
	if (workload >= 0 && workload <= 100 && workload <= GPUWorkload)
	{
		if (DebugOutput >= 1) printf("\t\tSetting GPU workload to %d%%\n", workload);

		GPUWorkload = workload;
		// Number of reference maps handled on the GPU for the new share.
		maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
	}
}
bioem* bioem_cuda_create()
{
int count;
......
......@@ -42,6 +42,7 @@ public:
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload);
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
......@@ -71,6 +72,7 @@ protected:
int FFTAlgo; //Use the FFT Algorithm (Default 1)
int DebugOutput; //Debug Output Level (Default 2)
int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
int Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
};
#endif
......@@ -33,6 +33,7 @@ public:
virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
virtual void rebalance(int workload);
protected:
virtual int deviceInit();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment