Planned maintenance on Wednesday, 2021-01-20, 17:00-18:00. Expect some interruptions during that time

bioem_cuda.cu 7.31 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

//__constant__ bioem_map gConvMap;

//Fatal-error helper: print the CUDA status code and message, then abort.
//Used to wrap every CUDA runtime call whose failure leaves the program in an
//unusable state.
static inline void checkCudaErrors(cudaError_t error)
{
	if (error == cudaSuccess) return;
	printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
	exit(1);
}

//Constructor: read the GPU tuning knobs from the environment, falling back to
//defaults when a variable is unset. Device resources are allocated later in
//deviceInit(), not here.
bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	const char* envAlgo = getenv("GPUALGO");
	GPUAlgo = (envAlgo == NULL) ? 2 : atoi(envAlgo);		//default: loop-over-shifts kernel
	const char* envAsync = getenv("GPUASYNC");
	GPUAsync = (envAsync == NULL) ? 1 : atoi(envAsync);		//default: asynchronous double buffering on
	const char* envWorkload = getenv("GPUWORKLOAD");
	GPUWorkload = (envWorkload == NULL) ? 100 : atoi(envWorkload);	//percent of reference maps done on the GPU
}

//Destructor: release all device resources (stream, events, device buffers).
bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

//One thread per reference map: compare the convolved map pMap against reference
//map iRefMap at the single displacement (cent_x, cent_y). Launched with a 1D
//grid; threads whose index is past maxRef exit immediately.
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap >= maxRef) return;	//grid tail guard
	compareRefMap<0>(iRefMap, iOrient, iConv, *pMap, pProb, param, *RefMap, cent_x, cent_y);
}

//One thread per reference map: compare pMap against reference map iRefMap,
//iterating over all displacements inside compareRefMapShifted<1>. Launched with
//a 1D grid; threads past maxRef exit immediately.
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap >= maxRef) return;	//grid tail guard
	compareRefMapShifted<1>(iRefMap, iOrient, iConv, *pMap, pProb, param, *RefMap);
}

//Kernel for GPUAlgo == 2: each thread handles one (reference map, x-shift, y-shift)
//triple, unpacked from the flat thread id with shifts and masks. This requires
//nShifts to be a power of two, with nShiftBits == log2(nShifts) (checked by the
//caller). blockoffset lets the host launch the total work in grid-sized chunks.
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	const int iRefMap = myid >> (nShiftBits << 1);	//high bits: which reference map
	const int myRef = myThreadIdxX >> (nShiftBits << 1);	//map index local to this block
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);	//middle bits: x-shift index
	const int myShiftIdy = myid & (nShifts - 1);	//low bits: y-shift index
	const int myShift = myid & (nShifts * nShifts - 1);	//flat shift index in [0, nShifts^2)
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
	
	//Out-of-range threads are flagged inactive instead of returning early --
	//presumably compareRefMap<2> performs a block-wide shared-memory reduction
	//(the host launch allocates dynamic shared memory), so every thread must reach it.
	const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
	
	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}

//Integer ceiling division: smallest count such that count * divider >= num
//(for positive operands).
template <class T> static inline T divup(T num, T divider)
{
	return (num + divider - 1) / divider;
}
//True iff x is a positive power of two (a power of two has exactly one set bit,
//so clearing the lowest set bit must leave zero).
static inline bool IsPowerOf2(int x)
{
	if (x <= 0) return false;
	return (x & (x - 1)) == 0;
}
#if defined(_WIN32)
//Integer floor(log2(value)) for value > 0 (MSVC path).
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	//BUGFIX: return the computed bit index; the previous code returned `value`
	//itself, which broke the shift-bit math everywhere ilog2() is used on Windows.
	return((int) index);
}
#else
//Integer floor(log2(value)) for value > 0 (GCC/Clang path).
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

//Compare the convolved projection conv_map (orientation iProjectionOut,
//convolution iConv) against the first maxRef reference maps on the GPU, using
//the algorithm selected by GPUAlgo (env GPUALGO). Returns 0; exits on
//unsupported options or CUDA errors. startMap != 0 is not supported here.
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
{
	if (startMap)
	{
		cout << "Error startMap not implemented for GPU Code\n";
		exit(1);
	}
	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
	if (GPUAsync)
	{
		//Double buffering on (iConv & 1): wait until the buffer with this parity
		//was fully consumed by the iteration that last used it.
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}
	//NOTE(review): conv_map is not obviously pinned host memory, so this async
	//copy may run synchronously -- confirm how conv_map is allocated at the caller.
	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
	
	if (GPUAlgo == 2) //Loop over shifts
	{
		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
		//The kernel unpacks shift indices with bit masks, so nShifts must be a power of two.
		if (!IsPowerOf2(nShifts))
		{
			cout << "Invalid number of displacements, no power of two\n";
			exit(1);
		}
		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
		{
			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
			exit(1);
		}
		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
		{
			cout << "Too many displacements for CUDA reduction\n";
			exit(1);
		}
		const int nShiftBits = ilog2(nShifts);
		size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
		size_t nBlocks = CUDA_BLOCK_COUNT;
		//Launch the grid in chunks of nBlocks; the dynamic shared-memory size is
		//presumably the reduction scratch used inside compareRefMap<2> -- confirm
		//against the device-side implementation.
		for (size_t i = 0;i < totalBlocks;i += nBlocks)
		{
			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
		}
	}
	else if (GPUAlgo == 1) //Split shifts in multiple kernels
	{
		//One kernel launch per displacement (cent_x, cent_y).
		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
		{
			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
			{
				compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
			}
		}
	}
	else if (GPUAlgo == 0) //All shifts in one kernel
	{
		compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
	}
	else
	{
		cout << "Invalid GPU Algorithm selected\n";
		exit(1);
	}
	//If the GPU handles only part of the workload, process the remaining
	//reference maps (indices >= maxRef) on the CPU via the base-class path,
	//overlapping with the asynchronous GPU launches above.
	if (GPUWorkload < 100)
	{
		bioem::compareRefMaps(iProjectionOut, iConv, conv_map, maxRef);
	}
	if (GPUAsync)
	{
		//Record completion so a later iteration with this parity can wait on it.
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
	}
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

//(Re)allocate all device resources: the CUDA stream, the reference-map and
//probability buffers, and the two double-buffered convolution-map slots.
//Safe to call repeatedly -- any previous state is torn down first.
//Returns 0; exits via checkCudaErrors() on CUDA failure.
int bioem_cuda::deviceInit()
{
	deviceExit();
	
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	for (int i = 0;i < 2;i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
	}
	//Both layouts share the same device allocation; only the interpretation differs.
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
	
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		//GPUAlgo 0/1 use the modified (transposed) reference-map layout.
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		//NOTE(review): copy size is sizeof(bioem_RefMap) although the source object
		//is a bioem_RefMap_Mod -- confirm the two types have identical size/layout.
		//Error checking added for consistency with the rest of this file.
		checkCudaErrors(cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
		delete RefMapGPU;
	}
	else
	{
		checkCudaErrors(cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice));
	}
	deviceInitialized = 1;
	return(0);
}

//Tear down everything allocated by deviceInit(). No-op when not initialized.
//Destruction errors are deliberately ignored (best effort during shutdown).
int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);
	
	cudaStreamDestroy(cudaStream);
	cudaFree(pRefMap_device);
	cudaFree(pProb_device);
	for (int i = 0;i < 2;i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		//BUGFIX: free each per-parity device buffer. The previous code called
		//cudaFree(pConvMap_device) -- the host-side array itself, and twice --
		//leaking both device allocations made in deviceInit().
		cudaFree(pConvMap_device[i]);
	}
	cudaThreadExit();	//NOTE(review): deprecated alias of cudaDeviceReset() -- consider migrating
	
	deviceInitialized = 0;
	return(0);
}

//Prepare a run: decide how many reference maps the GPU handles (GPUWorkload
//percent of the total; the remainder is done on the CPU in compareRefMaps())
//and upload the current probability array for those maps.
//Returns 0; exits via checkCudaErrors() on CUDA failure.
int bioem_cuda::deviceStartRun()
{
	maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
	
	//Error checking added for consistency with the rest of this file.
	checkCudaErrors(cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice));
	return(0);
}

//Finish a run: drain any pending asynchronous work on the stream, then copy the
//accumulated probabilities for the GPU's share of the maps back to the host.
//Returns 0; exits via checkCudaErrors() on CUDA failure.
int bioem_cuda::deviceFinishRun()
{
	//Error checking added for consistency with the rest of this file.
	if (GPUAsync) checkCudaErrors(cudaStreamSynchronize(cudaStream));
	//Blocking copy -- also synchronizes the device when GPUAsync is off.
	checkCudaErrors(cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost));
	
	return(0);
}

//Factory entry point: give the host code a CUDA-backed bioem instance
//(caller owns the returned object).
bioem* bioem_cuda_create()
{
	return new bioem_cuda;
}