#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

//__constant__ bioem_map gConvMap;

//Abort the program with a diagnostic if a CUDA runtime call did not succeed.
static inline void checkCudaErrors(cudaError_t error)
{
	if (error == cudaSuccess) return;
	printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
	exit(1);
}

//Constructor: read algorithm/async selection from the environment
//(GPUALGO defaults to 2 = loop-over-shifts, GPUASYNC defaults to 0 = synchronous).
bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	const char* envAlgo = getenv("GPUALGO");
	GPUAlgo = (envAlgo == NULL) ? 2 : atoi(envAlgo);
	const char* envAsync = getenv("GPUASYNC");
	GPUAsync = (envAsync == NULL) ? 0 : atoi(envAsync);
}

//Destructor: release GPU resources; deviceExit() is a no-op if deviceInit() never ran.
bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

//Kernel for GPUAlgo == 1: one thread per reference map, comparing against the
//convolved map at a single fixed displacement (cent_x, cent_y).
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
{
	//Flat 1D global thread index; guard the partial last block.
	const int refIdx = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (refIdx >= RefMap->ntotRefMap) return;
	compareRefMap<0>(refIdx, iOrient, iConv, *pMap, pProb, param, *RefMap, cent_x, cent_y);
}

//Kernel for GPUAlgo == 0: one thread per reference map; the thread iterates over
//all displacements internally (handled inside compareRefMapShifted<1>).
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
{
	//Flat 1D global thread index; guard the partial last block.
	const int refIdx = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (refIdx >= RefMap->ntotRefMap) return;
	compareRefMapShifted<1>(refIdx, iOrient, iConv, *pMap, pProb, param, *RefMap);
}

//Kernel for GPUAlgo == 2: each thread handles one (reference map, shift) pair.
//Launched in chunks by the host; blockoffset is the grid index of this chunk's first block.
//Preconditions enforced by the host wrapper: nShifts is a power of two,
//nShifts == 1 << nShiftBits, and nShifts*nShifts divides the block size.
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
{
	//Global work-item id, offset by the chunk start.
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	//Reference map index: strip the 2*nShiftBits low bits that encode the shift.
	const int iRefMap = myid >> (nShiftBits << 1);
	//Index of this thread's map WITHIN its block (note: derived from the thread
	//index, not the global id) — presumably addresses per-block shared memory
	//inside compareRefMap<2>; verify against that implementation.
	const int myRef = myThreadIdxX >> (nShiftBits << 1);
	//Decompose the low bits into the 2D shift-grid coordinates (masks valid
	//because nShifts is a power of two).
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
	const int myShiftIdy = myid & (nShifts - 1);
	//Linear shift index in [0, nShifts*nShifts).
	const int myShift = myid & (nShifts * nShifts - 1);
	//Convert grid coordinates to signed pixel displacements.
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
	
	//NOTE(review): no iRefMap < RefMap->ntotRefMap guard here, unlike the other
	//kernels — presumably compareRefMap<2> tolerates the out-of-range tail; confirm.
	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
}

//Integer ceiling division: number of chunks of size 'divider' needed to cover 'num'.
template <class T> static inline T divup(T num, T divider)
{
	const T padded = num + divider - 1;
	return padded / divider;
}
//True iff x is a positive power of two (exactly one bit set).
static inline bool IsPowerOf2(int x)
{
	if (x <= 0) return false;
	return (x & (x - 1)) == 0;
}
qon's avatar
qon committed
70 71 72 73 74 75
//Floor of log2(value); value must be > 0 (undefined for 0 in both branches).
#if defined(_WIN32)
static inline int ilog2 (int value)
{
	DWORD index;
	//_BitScanReverse writes the position of the highest set bit into 'index'.
	_BitScanReverse (&index, value);
	//BUGFIX: previously returned 'value' itself instead of the bit index.
	return((int) index);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

//Compare all reference maps against one convolved projection, dispatching to one
//of three kernel strategies selected at construction via the GPUALGO environment
//variable (2: loop over shifts in-kernel with reduction, 1: one launch per shift,
//0: all shifts inside a single kernel). Returns 0 on success, exits on error.
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
{
	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
	if (GPUAsync)
	{
		//Double buffering via (iConv & 1): wait until the previous use of this
		//convolution-map slot has finished before overwriting it.
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}
	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));

	if (GPUAlgo == 2) //Loop over shifts
	{
		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
		//The kernel decomposes shift indices with masks/shifts, so the per-axis
		//shift count must be a power of two...
		if (!IsPowerOf2(nShifts))
		{
			cout << "Invalid number of displacements, no power of two\n";
			exit(1);
		}
		//...and all nShifts^2 shifts of one map must fit evenly into a block
		//for the in-block reduction to work.
		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
		{
			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
			exit(1);
		}
		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
		{
			cout << "Too many displacements for CUDA reduction\n";
			exit(1);
		}
		const int nShiftBits = ilog2(nShifts);
		const size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
		const size_t nBlocks = CUDA_BLOCK_COUNT;
		//Launch in chunks of nBlocks so one launch never exceeds the grid limit.
		for (size_t i = 0;i < totalBlocks;i += nBlocks)
		{
			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
		}
	}
	else if (GPUAlgo == 1) //Split shifts in multiple kernels
	{
		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
		{
			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
			{
				compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
			}
		}
	}
	else if (GPUAlgo == 0) //All shifts in one kernel
	{
		compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod);
	}
	else
	{
		cout << "Invalid GPU Algorithm selected\n";
		exit(1);
	}
	//Kernel launches do not return a status; pick up launch-configuration errors here.
	checkCudaErrors(cudaGetLastError());
	if (GPUAsync)
	{
		//Record completion so the next reuse of this buffer slot can wait on it.
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
	}
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

//Allocate device buffers and upload the reference maps (constant for a run).
//Safe to call repeatedly: any previous state is torn down first.
int bioem_cuda::deviceInit()
{
	deviceExit();
	
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	//Two convolution-map buffers (double buffering keyed by iConv & 1), one event each.
	for (int i = 0;i < 2;i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
	}
	//GPUAlgo 0/1 reinterpret the same device buffer with the modified layout.
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
	
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		//NOTE(review): copy size is sizeof(bioem_RefMap) although the source is a
		//bioem_RefMap_Mod (and the device buffer was allocated with
		//sizeof(bioem_RefMap)) — verify the two structs have identical size.
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		checkCudaErrors(cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
		delete RefMapGPU;
	}
	else
	{
		checkCudaErrors(cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
	}
	deviceInitialized = 1;
	return(0);
}

//Release all device resources created by deviceInit(). Idempotent: returns
//immediately if the device was never (or is no longer) initialized.
int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);
	
	cudaStreamDestroy(cudaStream);
	cudaFree(pRefMap_device);
	cudaFree(pProb_device);
	for (int i = 0;i < 2;i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		//BUGFIX: was cudaFree(pConvMap_device) — that passed the address of the
		//host-side pointer array instead of the device allocations, leaking both
		//convolution-map buffers.
		cudaFree(pConvMap_device[i]);
	}
	//cudaThreadExit() is deprecated; cudaDeviceReset() is its documented replacement.
	cudaDeviceReset();
	
	deviceInitialized = 0;
	return(0);
}

//Upload the host probability array to the device before a processing run.
int bioem_cuda::deviceStartRun()
{
	//Previously unchecked: a failed upload would silently corrupt the run.
	checkCudaErrors(cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
	return(0);
}

//Wait for outstanding asynchronous work and download the accumulated probabilities.
int bioem_cuda::deviceFinishRun()
{
	//In async mode the stream may still hold queued kernels/copies; drain it first.
	if (GPUAsync) checkCudaErrors(cudaStreamSynchronize(cudaStream));
	//Previously unchecked: a failed download would silently return stale host data.
	checkCudaErrors(cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost));
	
	return(0);
}

//Factory: hand back the CUDA implementation behind the generic bioem interface.
bioem* bioem_cuda_create()
{
	bioem_cuda* instance = new bioem_cuda();
	return instance;
}