Commit efa38150 authored by Pilar Cossio's avatar Pilar Cossio
Browse files

Merge branch 'BioEM-1.0' into 'BioEM-1.0'

minor fixes

See merge request !2
parents 665e1ec1 dc843f20
Pipeline #14456 passed with stage
in 40 seconds
......@@ -37,7 +37,7 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BIOEM_GCC_FLAGS}")
endif()
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp")
set (BIOEM_SOURCE_FILES "bioem.cpp" "main.cpp" "map.cpp" "model.cpp" "param.cpp" "timer.cpp" "autotuner.cpp")
###Find Required Packages
find_package(PkgConfig)
......@@ -67,6 +67,7 @@ if (CUDA_FOUND)
if (CUDA_FORCE_GCC)
cmake_minimum_required(VERSION 2.8.10.1)
#Use GCC as host compiler for CUDA even though host compiler for other files is not GCC
#set (CUDA_HOST_COMPILER /mpcdf/soft/SLES122/common/gcc/5.4.0/bin/gcc)
set (CUDA_HOST_COMPILER gcc)
endif()
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--use_fast_math;-ftz=true;-O4;-Xptxas -O4")
......
File mode changed from 100644 to 100755
#include "autotuner.h"
void Autotuner::Reset()
{
stopTuning = false;
workload = 100;
best_time = 0.;
best_workload = 0;
a = 1;
b = 50;
c = 100;
x = 50;
limit = 1;
fb = 0.;
fx = 0.;
if (algo == 3) workload = 50;
}
bool Autotuner::Needed(int iteration)
{
if (stopTuning) return false;
switch (algo)
{
case 1:
case 3:
return iteration % (stable + 1) == stable;
case 2: return (iteration == (int) stable / 2 ) || (iteration == stable);
default: /* Should never happen */;
}
return false;
}
bool Autotuner::Finished()
{
switch (algo)
{
case 1:
if (workload < 30)
{
workload = best_workload;
return stopTuning = true;
}
break;
case 2:
if (best_workload != 0) return stopTuning = true;
break;
case 3:
if ((c - b == limit) && (b - a == limit)) return stopTuning = true;
break;
default: /* Should never happen */;
}
return false;
}
void Autotuner::Tune(double compTime)
{
switch (algo)
{
case 1: AlgoSimple(compTime); break;
case 2: AlgoRatio(compTime); break;
case 3: AlgoBisection(compTime); break;
default: /* Should never happen */;
}
}
void Autotuner::AlgoSimple(double compTime)
{
if (best_time == 0. || compTime < best_time)
{
best_time = compTime;
best_workload = workload;
}
workload -= 5;
}
void Autotuner::AlgoRatio(double compTime)
{
if (best_time == 0.)
{
best_time = compTime;
workload = 1;
}
else
{
best_workload = (int) 100 * (compTime / (best_time + compTime));
workload = best_workload;
}
}
void Autotuner::AlgoBisection(double compTime)
{
if (fb == 0.)
{
fb = compTime;
x = 75;
workload = x;
return;
}
fx = compTime;
if (fx < fb)
{
if (x < b)
c = b;
else
a = b;
b = x;
fb = fx;
}
else
{
if (x < b)
a = x;
else
c = x;
}
x = (c-b > b-a) ? (int)(b+(c-b)/2) : (int)(a+(b-a+1)/2);
workload = x;
}
......@@ -41,6 +41,7 @@
#include <fftw3.h>
#include <math.h>
#include "timer.h"
#include "autotuner.h"
#include "param.h"
#include "bioem.h"
......@@ -97,6 +98,7 @@ bioem::bioem()
FFTAlgo = getenv("FFTALGO") == NULL ? 1 : atoi(getenv("FFTALGO"));
DebugOutput = getenv("BIOEM_DEBUG_OUTPUT") == NULL ? 2 : atoi(getenv("BIOEM_DEBUG_OUTPUT"));
nProjectionsAtOnce = getenv("BIOEM_PROJECTIONS_AT_ONCE") == NULL ? 1 : atoi(getenv("BIOEM_PROJECTIONS_AT_ONCE"));
Autotuning = false;
}
bioem::~bioem()
......@@ -363,19 +365,36 @@ int bioem::configure(int ac, char* av[])
printf("Time Precalculate %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
if(!param.printModel)pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, *this);
if (DebugOutput >= 2 && mpi_rank == 0)
// ****************** For autotuning **********************
if ((getenv("GPU") && atoi(getenv("GPU"))) && ((!getenv("GPUWORKLOAD") || (atoi(getenv("GPUWORKLOAD")) == -1))) && (!getenv("BIOEM_DEBUG_BREAK") || (atoi(getenv("BIOEM_DEBUG_BREAK")) > FIRST_STABLE)))
{
printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
Autotuning = true;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload enabled:\n\tAlgorithm %d\n\tRecalibration at every %d projections\n\tComparisons are considered stable after first %d comparisons\n", AUTOTUNING_ALGORITHM, RECALIB_FACTOR, FIRST_STABLE);
}
else
{
Autotuning = false;
if (mpi_rank == 0) printf("Autotuning of GPUWorkload disabled\n");
}
// ****************** Initializng pointers *********************
// ****************** Initializing pointers *********************
deviceInit();
if (DebugOutput >= 2 && mpi_rank == 0) printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
if (DebugOutput >= 2 && mpi_rank == 0)
{
printf("Time Device Init %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
if(!param.printModel)pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, param.nTotCC, *this);
if (DebugOutput >= 2 && mpi_rank == 0)
{
printf("Time Init Probabilities %f\n", timer.GetCurrentElapsedTime());
timer.ResetStart();
}
return(0);
}
......@@ -520,6 +539,14 @@ int bioem::run()
HighResTimer timer, timer2;
/* Autotuning */
Autotuner aut;
if (Autotuning)
{
aut.Initialize(AUTOTUNING_ALGORITHM, FIRST_STABLE);
rebalance(aut.Workload());
}
if (DebugOutput >= 1 && mpi_rank == 0) printf("\tMain Loop GridAngles %d, CTFs %d, RefMaps %d, Shifts (%d/%d)², Pixels %d², OMP Threads %d, MPI Ranks %d\n", param.nTotGridAngles, param.nTotCTFs, RefMap.ntotRefMap, 2 * param.param_device.maxDisplaceCenter + param.param_device.GridSpaceCenter, param.param_device.GridSpaceCenter, param.param_device.NumberPixels, omp_get_max_threads(), mpi_size);
......@@ -548,6 +575,13 @@ int bioem::run()
}
if (DebugOutput >= 2) printf("\tTime Projection %d: %f (rank %d)\n", iOrientAtOnce, timer.GetCurrentElapsedTime(), mpi_rank);
/* Recalibrate if needed */
if (Autotuning && ((iOrientAtOnce - iOrientStart) % RECALIB_FACTOR == 0) && ((iOrientEnd - iOrientAtOnce) > RECALIB_FACTOR) && (iOrientAtOnce != iOrientStart))
{
aut.Reset();
rebalance(aut.Workload());
}
for (int iOrient = iOrientAtOnce; iOrient < iTmpEnd;iOrient++)
{
mycomplex_t* proj_mapFFT = &proj_mapsFFT[(iOrient - iOrientAtOnce) * ProjMapSize];
......@@ -563,8 +597,7 @@ int bioem::run()
createConvolutedProjectionMap(iOrient, iConv, proj_mapFFT, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
if (DebugOutput >= 2) printf("\t\tTime Convolution %d %d: %f (rank %d)\n", iOrient, iConv, timer.GetCurrentElapsedTime(), mpi_rank);
if (DebugOutput >= 2) timer.ResetStart();
if ((DebugOutput >= 2) || (Autotuning && aut.Needed(iConv))) timer.ResetStart();
myfloat_t amp,pha,env;
amp=param.CtfParam[iConv].pos[0];
......@@ -576,9 +609,10 @@ int bioem::run()
compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, conv_mapFFT, sumCONV, sumsquareCONV);
double compTime=0.;
if (DebugOutput >= 2)
{
const double compTime = timer.GetCurrentElapsedTime();
compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
......@@ -586,7 +620,15 @@ int bioem::run()
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
if (Autotuning) printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s, with GPU workload %d%%) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., aut.Workload(), mpi_rank);
else printf("\t\tTime Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s) (rank %d)\n", iOrient, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000., mpi_rank);
}
if (Autotuning && aut.Needed(iConv))
{
if (compTime == 0.) compTime = timer.GetCurrentElapsedTime();
aut.Tune(compTime);
if (aut.Finished() && DebugOutput >= 2) printf("\t\tOptimal GPU workload %d%% (rank %d)\n", aut.Workload(), mpi_rank);
rebalance(aut.Workload());
}
}
if (DebugOutput >= 1)
......@@ -1413,3 +1455,5 @@ void bioem::free_device_host(void* ptr)
{
free(ptr);
}
void bioem::rebalance(int workload) {}
......@@ -67,12 +67,77 @@ static const char *cufftGetErrorStrung(cufftResult error)
return "UNKNOWN";
}
/* Handing CUDA Driver errors */
#define cuErrorCheck(call) \
do { \
CUresult __error__; \
if ((__error__ = (call)) != CUDA_SUCCESS) { \
printf("CUDA Driver Error %d / %s (%s %d)\n", __error__, cuGetError(__error__),__FILE__, __LINE__); \
return __error__; \
} \
} while (false)
static const char * cuGetError(CUresult result) {
switch (result) {
case CUDA_SUCCESS: return "No errors";
case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
case CUDA_ERROR_PROFILER_DISABLED: return "Profiler disabled";
case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "Profiler not initialized";
case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "Profiler already started";
case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "Profiler already stopped";
case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
case CUDA_ERROR_MAP_FAILED: return "Map failed";
case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Not mapped as array";
case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Not mapped as pointer";
case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error";
case CUDA_ERROR_UNSUPPORTED_LIMIT: return "Unsupported CUlimit";
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "Context already in use";
case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Shared object symbol not found";
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
case CUDA_ERROR_OPERATING_SYSTEM: return "Operating System call failed";
case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
case CUDA_ERROR_NOT_FOUND: return "Not found";
case CUDA_ERROR_NOT_READY: return "CUDA not ready";
case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "Peer access already enabled";
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "Peer access not enabled";
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "Primary context active";
case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "Context is destroyed";
case CUDA_ERROR_ASSERT: return "Device assert failed";
case CUDA_ERROR_TOO_MANY_PEERS: return "Too many peers";
case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "Host memory already registered";
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "Host memory not registered";
case CUDA_ERROR_UNKNOWN: return "Unknown error";
default: return "Unknown error code";
}
}
bioem_cuda::bioem_cuda()
{
deviceInitialized = 0;
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
if (GPUWorkload == -1) GPUWorkload = 100;
GPUDualStream = getenv("GPUDUALSTREAM") == NULL ? 1 : atoi(getenv("GPUDUALSTREAM"));
}
......@@ -172,15 +237,35 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
cout << "Error startMap not implemented for GPU Code\n";
exit(1);
}
#ifdef DEBUG_GPU
float time;
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, 0));
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
}
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to synch projections %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
#endif
if (FFTAlgo)
{
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[GPUAsync ? 2 : 0]));
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[2], cudaStream[2]));
......@@ -204,7 +289,7 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
}
cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream[j & 1]>>>(iOrient, iConv, amp, pha, env, pFFTtmp[j & 1], sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
}
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaPeekAtLastError());
if (GPUDualStream)
{
checkCudaErrors(cudaEventRecord(cudaFFTEvent[1], cudaStream[1]));
......@@ -214,7 +299,13 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
else
{
checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream[0]));
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time for memcpy %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0) );
#endif
if (GPUAlgo == 2) //Loop over shifts
{
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
......@@ -261,10 +352,23 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
exit(1);
}
}
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run CUDA %1.6f sec\n", time/1000);
checkCudaErrors(cudaEventRecord(start, 0));
#endif
if (GPUWorkload < 100)
{
bioem::compareRefMaps(iOrient, iConv, amp, pha, env, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
}
#ifdef DEBUG_GPU
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("\t\t\tGPU: time to run OMP %1.6f sec\n", time/1000);
#endif
if (GPUAsync)
{
checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream[0]));
......@@ -279,41 +383,34 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, myfloat_t amp, myfloat_t
int bioem_cuda::selectCudaDevice()
{
int count;
long long int bestDeviceSpeed = -1;
int bestDevice;
int bestDevice = 0;
cudaDeviceProp deviceProp;
/* Initializing CUDA driver API */
cuErrorCheck(cuInit(0));
/* Get number of available CUDA devices */
checkCudaErrors(cudaGetDeviceCount(&count));
if (count == 0)
{
printf("No CUDA device detected\n");
return(1);
}
for (int i = 0;i < count;i++)
/* Find the best GPU */
long long int bestDeviceSpeed = -1, deviceSpeed = -1;
for (int i = 0; i < count; i++)
{
#if CUDA_VERSION > 3010
size_t free, total;
#else
unsigned int free, total;
#endif
cuInit(0);
CUdevice tmpDevice;
cuDeviceGet(&tmpDevice, i);
CUcontext tmpContext;
cuCtxCreate(&tmpContext, 0, tmpDevice);
if(cuMemGetInfo(&free, &total)) exit(1);
cuCtxDestroy(tmpContext);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
if (DebugOutput >= 2 && mpi_rank == 0) printf("CUDA Device %2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)\n", i, deviceProp.name, deviceProp.major, deviceProp.minor, (long long int) free, (long long int) deviceProp.totalGlobalMem);
long long int deviceSpeed = (long long int) deviceProp.multiProcessorCount * (long long int) deviceProp.clockRate * (long long int) deviceProp.warpSize;
cudaGetDeviceProperties(&deviceProp, i);
deviceSpeed = (long long int) deviceProp.multiProcessorCount * (long long int) deviceProp.clockRate * (long long int) deviceProp.warpSize;
if (deviceSpeed > bestDeviceSpeed)
{
bestDevice = i;
bestDeviceSpeed = deviceSpeed;
}
}
/* Get user-specified GPU choice */
if (getenv("GPUDEVICE"))
{
int device = atoi(getenv("GPUDEVICE"));
......@@ -331,13 +428,30 @@ int bioem_cuda::selectCudaDevice()
if (device < 0)
{
printf("Negative CUDA device specified: %d, invalid!\n", device);
exit(1);
}
bestDevice = device;
}
checkCudaErrors(cudaSetDevice(bestDevice));
cudaGetDeviceProperties(&deviceProp ,bestDevice);
/* Set CUDA processes to appropriate devices */
cudaGetDeviceProperties(&deviceProp, bestDevice);
if (deviceProp.computeMode == 0)
{
checkCudaErrors(cudaSetDevice(bestDevice));
}
else
{
if (DebugOutput >= 1)
{
printf("CUDA device %d is not set in DEFAULT mode, make sure that CUDA processes are pinned as planned!\n", bestDevice);
printf("Pinning process %d to CUDA device %d\n", mpi_rank, bestDevice);
}
checkCudaErrors(cudaSetDevice(bestDevice));
/* This synchronization is needed in order to detect bogus silent errors from cudaSetDevice call */
checkCudaErrors(cudaDeviceSynchronize());
}
/* Debugging information about CUDA devices used by the current process */
if (DebugOutput >= 3)
{
printf("Using CUDA Device %s with Properties:\n", deviceProp.name);
......@@ -356,13 +470,33 @@ int bioem_cuda::selectCudaDevice()
printf("memoryClockRate = %d\n", deviceProp.memoryClockRate);
printf("multiProcessorCount = %d\n", deviceProp.multiProcessorCount);
printf("textureAlignment = %lld\n", (unsigned long long int) deviceProp.textureAlignment);
printf("computeMode = %d\n", deviceProp.computeMode);
#if CUDA_VERSION > 3010
size_t free, total;
#else
unsigned int free, total;
#endif
if (deviceProp.computeMode == 0)
{
CUdevice tmpDevice;
cuErrorCheck(cuDeviceGet(&tmpDevice, bestDevice));
CUcontext tmpContext;
cuErrorCheck(cuCtxCreate(&tmpContext, 0, tmpDevice));
cuErrorCheck(cuMemGetInfo(&free, &total));
cuErrorCheck(cuCtxDestroy(tmpContext));
}
else
{
cuErrorCheck(cuMemGetInfo(&free, &total));
}
printf("free memory = %lld; total memory = %lld\n", free, total);
}
if (DebugOutput >= 1)
{
printf("BioEM for CUDA initialized (MPI Rank %d), %d GPUs found, using GPU %d\n", mpi_rank, count, bestDevice);
}
return(0);
}
......@@ -485,7 +619,7 @@ int bioem_cuda::deviceStartRun()
}
else
{
maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
maxRef = RefMap.ntotRefMap == 1 ? (size_t) RefMap.ntotRefMap : (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
pProb_host = new bioem_Probability;
pProb_host->init(maxRef, param.nTotGridAngles, param.nTotCC, *this);
pProb_host->copyFrom(&pProb, *this);
......@@ -566,6 +700,23 @@ void bioem_cuda::free_device_host(void* ptr)
cudaFreeHost(ptr);
}
void bioem_cuda::rebalance(int workload)
{
if ((workload < 0) || (workload > 100) || (workload == GPUWorkload)) return;
deviceFinishRun();
if (DebugOutput >= 2)
{
printf("\t\tSetting GPU workload to %d%% (rank %d)\n", workload, mpi_rank);
}
GPUWorkload = workload;
maxRef = (size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100;
deviceStartRun();
}
bioem* bioem_cuda_create()
{
int count;
......
/* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
< BioEM software for Bayesian inference of Electron Microscopy images>
Copyright (C) 2016 Pilar Cossio, David Rohr, Fabio Baruffa, Markus Rampp,
Volker Lindenstruth and Gerhard Hummer.
Max Planck Institute of Biophysics, Frankfurt, Germany.
See license statement for terms of distribution.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#ifndef AUTOTUNER_H
#define AUTOTUNER_H
class Autotuner {
public:
Autotuner() {stopTuning = true;}
/* Setting variables to initial values */
inline void Initialize(int alg=3, int st=7) {algo = alg; stable=st; Reset(); }
/* Resetting variables to initial values */
void Reset();