Add optional autotuning of the CPU/GPU load balance (BIOEM_AUTOTUNING).

When BIOEM_AUTOTUNING is non-zero, bioem::run() samples the elapsed time
of every fifth convolution step (iConv % 5 == 4), lowers the GPU workload
by 5 after each sample and, once the workload drops below 30, stops tuning
and switches to the workload with the shortest sampled time. The starting
workload is read from GPUWORKLOAD (default 100). Every change is applied
via deviceFinishRun() / rebalance() / deviceStartRun().

Review notes:
 * bioem_cuda::rebalance() no longer rejects workload > GPUWorkload. With
   that check the final rebalance(best_workload) call was ignored whenever
   the best sample was not the last one tried.
 * Line breaks, indentation and blank context lines were reconstructed from
   the hunk headers, so apply with whitespace tolerance (for example
   git apply --ignore-whitespace). Whitespace-only -/+ pairs are kept even
   though their difference is no longer visible. The post-image hashes on
   the index lines of bioem.cpp and bioem_cuda.cu are stale.

diff --git a/bioem.cpp b/bioem.cpp
index adddebb4269c6bddd295dc704b523f4f4038ad84..fe1af043c5e372b670005b66329eccb14c9e6cd9 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -97,6 +97,7 @@ bioem::bioem()
 	FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
 	DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
 	nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
+	Autotuning = getenv("BIOEM_AUTOTUNING") == NULL ? 0 : atoi(getenv("BIOEM_AUTOTUNING"));
 }
 
 bioem::~bioem()
@@ -520,6 +521,12 @@ int bioem::run()
 
 	HighResTimer timer, timer2;
 
+	/* These variables are used for Autotuning */
+	double best_time = 0;
+	int workload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
+	int best_workload = workload;
+	bool stopTuning=false;
+
 	if (DebugOutput >= 1 && mpi_rank == 0)
 		printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
 
@@ -559,6 +566,7 @@ int bioem::run()
 			{
 				// *** Calculating convolutions of projection map and crosscorrelations ***
 
+				if (Autotuning && !stopTuning) timer.ResetStart();
 				if (DebugOutput >= 2) timer.ResetStart();
 				createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
 				if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
@@ -576,9 +584,10 @@ int bioem::run()
 
 				compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
 
+				double compTime=0.;
 				if (DebugOutput >= 2)
 				{
-					const double compTime = timer.GetCurrentElapsedTime();
+					compTime = timer.GetCurrentElapsedTime();
 					const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
 					const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
 										  (((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
@@ -588,12 +597,33 @@ int bioem::run()
 
 					printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
 				}
+				if (Autotuning && !stopTuning && (iConv % 5 == 4))
+				{
+					if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
+
+					if (best_time==0 || compTime < best_time)
+					{
+						best_time = compTime;
+						best_workload = workload;
+					}
+
+					workload -= 5;
+					if (workload < 30)
+					{
+						stopTuning=true;
+						workload=best_workload;
+					}
+
+					deviceFinishRun();
+					rebalance(workload);
+					deviceStartRun();
+				}
 			}
 			if (DebugOutput >= 1)
 			{
 				printf("\tTotal time for projection %d: %f (rank %d)\n", iOrient, timer2.GetCurrentElapsedTime(), mpi_rank);
-				timer2.ResetStart();
-			}
+				timer2.ResetStart();
+			}
 		}
 	}
 	//deallocating fftw_complex vector
@@ -1395,7 +1425,7 @@ int bioem::deviceInit()
 }
 
 int bioem::deviceStartRun()
-{
+{
 	deviceInit();
 	return(0);
 }
@@ -1413,3 +1443,6 @@ void bioem::free_device_host(void* ptr)
 {
 	free(ptr);
 }
+
+// Base-class hook called by the autotuner in run(); intentionally a no-op here (bioem_cuda::rebalance sets the GPU share).
+void bioem::rebalance(int workload) {}
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index f2f86bb7c38143c95c6d03e905dca9d5afb93773..e2be9e5c3bee492a6e4fc5f778bfcf3d815bf552 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -572,6 +572,25 @@ void bioem_cuda::free_device_host(void* ptr)
 	cudaFreeHost(ptr);
 }
 
+// Set the percentage of reference maps assigned to the GPU and recompute
+// maxRef from it. Requests outside 0..100 are ignored and leave the
+// current setting untouched. bioem::run() wraps this call in
+// deviceFinishRun() / deviceStartRun().
+void bioem_cuda::rebalance(int workload)
+{
+	// Only the range is validated: the autotuner's final call restores
+	// best_workload, which may be larger than the current GPUWorkload.
+	if ((workload < 0) || (workload > 100)) return;
+
+	if (DebugOutput >= 1)
+	{
+		printf("\t\tSetting GPU workload to %d%%\n", workload);
+	}
+
+	GPUWorkload = workload;
+	maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
+}
+
 bioem* bioem_cuda_create()
 {
 	int count;
diff --git a/include/bioem.h b/include/bioem.h
index 7c0e0d91a9008981567f240152843f14fcce8e39..5714f52487ad32b302ce2eee0a4853ac67f3ec5c 100644
--- a/include/bioem.h
+++ b/include/bioem.h
@@ -42,6 +42,7 @@ public:
 
 	virtual void* malloc_device_host(size_t size);
 	virtual void free_device_host(void* ptr);
+	virtual void rebalance(int workload);
 
 	int createProjection(int iMap, mycomplex_t* map);
 	int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
@@ -71,6 +72,7 @@ protected:
 	int FFTAlgo; //Use the FFT Algorithm (Default 1)
 	int DebugOutput; //Debug Output Level (Default 2)
 	int nProjectionsAtOnce; //Number of projections to do at once via OpenMP (Default 1)
+	int Autotuning; //Do the autotuning of the load-balancing between CPUs and GPUs
 };
 
 #endif
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index 60939da01043aded5181a829ed9d8425da0c688a..4d792f66b43ba0a0fb3011baa5d6ac42ff8a4680 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -33,6 +33,7 @@ public:
 	virtual int compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t pha, myfloat_t env, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
 	virtual void* malloc_device_host(size_t size);
 	virtual void free_device_host(void* ptr);
+	virtual void rebalance(int workload);
 
 protected:
 	virtual int deviceInit();