// GPU run-time options and device state.
int GPUAlgo; // GPU algorithm to use. 0: parallelize over maps; 1: as 0, but work split into multiple kernels (better); 2: also parallelize over shifts (best).
int GPUAsync; // Run the GPU asynchronously; do the convolutions on the host in parallel.
int GPUDualStream; // Use two streams to improve parallelism.
int GPUWorkload; // Percentage of the workload to perform on the GPU. Default 100. The rest is done on the processor in parallel.
// NOTE(review): undocumented in the original; presumably an upper bound on a reference count -- confirm against its users.
int maxRef;
// Declared here, defined elsewhere. NOTE(review): the int return is presumably a status code -- confirm at the definition.
int selectCudaDevice();
// NOTE(review): presumably non-zero once the CUDA device has been set up -- confirm where it is written.
int deviceInitialized;
// Array of PIPELINE_LVL+1 CUDA stream handles.
cudaStream_t cudaStream[PIPELINE_LVL+1];// Streams are used for both