#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

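//Evaluate a CUDA runtime call and abort with file/line information if it did not return cudaSuccess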
#define checkCudaErrors(error) \
{ \
	if ((error) != cudaSuccess) \
	{ \
		printf("CUDA Error %d / %s (%s: %d)\n", error, cudaGetErrorString(error), __FILE__, __LINE__); \
		exit(1); \
	} \
}

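//GPU behavior is configured via environment variables:
//  GPUALGO     - 0: all shifts in one kernel, 1: one kernel launch per shift, 2: loop over shifts inside one kernel (default)
//  GPUASYNC    - 1 (default): overlap transfers and kernels using a stream and events
//  GPUWORKLOAD - percentage of reference maps processed on the GPU (default 100); the remainder is handled by the CPU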
bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
	GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
	GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
}

bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

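//Kernel for GPUAlgo 1: one thread per reference map, comparing the convoluted map against
//reference map iRefMap at a single fixed center displacement (cent_x, cent_y)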
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int cent_x, const int cent_y, const int maxRef)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap < maxRef)
	{
		compareRefMap<0>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap, cent_x, cent_y);
	}
}

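//Kernel for GPUAlgo 0: one thread per reference map; compareRefMapShifted loops over all
//center displacements internally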
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod RefMap, const int maxRef)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap < maxRef)
	{
		compareRefMapShifted<1>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap);
	}
}

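//Grid-stride zeroing of a device buffer; size is assumed to be a multiple of sizeof(int),
//any trailing bytes are left untouched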
__global__ void cudaZeroMem(void* ptr, size_t size)
{
	int* myptr = (int*) ptr;
	int mysize = size / sizeof(int);
	int myid = myBlockDimX * myBlockIdxX + myThreadIdxX;
	int mygrid = myBlockDimX * myGridDimX;
	for (int i = myid; i < mysize; i += mygrid) myptr[i] = 0;
}

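//Kernel for GPUAlgo 2: one thread per (reference map, shift) pair. The flat thread index is
//bit-packed as [iRefMap | shift_x | shift_y] with nShiftBits bits per shift coordinate, which
//is why nShifts must be a power of two. E.g. for nShifts = 4 (nShiftBits = 2), every group of
//16 consecutive threads covers all 4x4 displacements of one reference map. myRef is the map's
//slot within the block, used to index the per-map data for the shared-memory reduction.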
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const myfloat_t* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
{
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	const int iRefMap = myid >> (nShiftBits << 1);
	const int myRef = myThreadIdxX >> (nShiftBits << 1);
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
	const int myShiftIdy = myid & (nShifts - 1);
	const int myShift = myid & (nShifts * nShifts - 1);
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;

	const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;

	compareRefMap<2>(iRefMap, iOrient, iConv, pMap, pProb, param, RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
}

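//Pointwise multiplication in Fourier space (correlation theorem): each block computes
//convmap_FFT * conj(refmap_FFT) for one reference map; the inverse FFT of the result
//yields the cross-correlation for all center displacements at once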
__global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* refmap, mycuComplex_t* out, const int NumberPixelsTotal, const int MapSize, const int NumberMaps, const int Offset)
{
	if (myBlockIdxX >= NumberMaps) return;
	const mycomplex_t* myin = &refmap[(myBlockIdxX + Offset) * MapSize];	//Offset counts maps, so block b handles reference map (Offset + b)
	mycuComplex_t* myout = &out[myBlockIdxX * MapSize];
	for(int i = myThreadIdxX; i < NumberPixelsTotal; i += myBlockDimX)
	{
		myout[i].x = convmap[i][0] * myin[i][0] + convmap[i][1] * myin[i][1];
		myout[i].y = convmap[i][1] * myin[i][0] - convmap[i][0] * myin[i][1];
	}
}

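//Post-processing after the inverse FFT: each thread takes the cross-correlation map (lCC)
//of one reference map and accumulates its probabilities via doRefMapFFT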
__global__ void cuDoRefMapsFFT(const int iOrient, const int iConv, const myfloat_t* lCC, const myfloat_t sumC, const myfloat_t sumsquareC, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap RefMap, const int maxRef, const int Offset)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX + Offset;
	const myfloat_t* mylCC = &lCC[(myBlockIdxX * myBlockDimX + myThreadIdxX) * param.NumberPixels * param.NumberPixels];
	if (iRefMap >= maxRef) return;
	doRefMapFFT(iRefMap, iOrient, iConv, mylCC, sumC, sumsquareC, pProb, param, RefMap);
}

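//Small integer helpers: round-up division, power-of-two test, and integer log2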
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	return((int) index);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

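//Entry point called once per (projection, convolution) pair: uploads the convoluted map
//and dispatches to the FFT-based comparison or to one of the three direct GPU algorithms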
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
{
	if (startMap)
	{
		cout << "Error: startMap not implemented for GPU code\n";
		exit(1);
	}
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}

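	//FFT algorithm: upload the FFT of the convoluted projection once, then process the
	//reference maps in batches of CUDA_FFTS_AT_ONCE: multiply in Fourier space, run a
	//batched inverse FFT, and reduce the resulting cross-correlations to probabilities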
	if (FFTAlgo)
	{
		checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream));
		for (int i = 0; i < maxRef; i += CUDA_FFTS_AT_ONCE)
		{
			const int num = min(CUDA_FFTS_AT_ONCE, maxRef - i);
			multComplexMap<<<num, CUDA_THREAD_COUNT, 0, cudaStream>>>(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], pRefMapsFFT, pFFTtmp2, param.param_device.NumberPixels * param.param_device.NumberFFTPixels1D, param.FFTMapSize, num, i);
			if (mycufftExecC2R(i + CUDA_FFTS_AT_ONCE > maxRef ? plan[1] : plan[0], pFFTtmp2, pFFTtmp) != CUFFT_SUCCESS)
			{
				cout << "Error running CUFFT\n";
				exit(1);
			}
			cuDoRefMapsFFT<<<divup(num, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>>(iProjectionOut, iConv, pFFTtmp, sumC, sumsquareC, pProb_device, param.param_device, *gpumap, num, i);
		}
		checkCudaErrors(cudaGetLastError());
	}
	else
	{
		checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], conv_map, sizeof(myfloat_t) * RefMap.refMapSize, cudaMemcpyHostToDevice, cudaStream));

		if (GPUAlgo == 2) //Loop over shifts
		{
			const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
			if (!IsPowerOf2(nShifts))
			{
				cout << "Invalid number of displacements, no power of two\n";
				exit(1);
			}
			if (CUDA_THREAD_COUNT % (nShifts * nShifts))
			{
				cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
				exit(1);
			}
			if (nShifts > CUDA_MAX_SHIFT_REDUCE)
			{
				cout << "Too many displacements for CUDA reduction\n";
				exit(1);
			}
			const int nShiftBits = ilog2(nShifts);
			size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
			size_t nBlocks = CUDA_BLOCK_COUNT;
			for (size_t i = 0; i < totalBlocks; i += nBlocks)
			{
				compareRefMapLoopShifts_kernel<<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream >>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *gpumap, i, nShifts, nShiftBits, maxRef);
			}
		}
		else if (GPUAlgo == 1) //Split shifts in multiple kernels
		{
			for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x = cent_x + param.param_device.GridSpaceCenter)
			{
				for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y = cent_y + param.param_device.GridSpaceCenter)
				{
					compareRefMap_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, cent_x, cent_y, maxRef);
				}
			}
		}
		else if (GPUAlgo == 0) //All shifts in one kernel
		{
			compareRefMapShifted_kernel<<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, *pRefMap_device_Mod, maxRef);
		}
		else
		{
			cout << "Invalid GPU Algorithm selected\n";
			exit(1);
		}
	}
	//If the GPU handles less than 100% of the workload, the CPU processes the remaining maps
	if (GPUWorkload < 100)
	{
		bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
	}
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
	}
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

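//Allocate all device buffers and upload the static reference-map data to the GPU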
int bioem_cuda::deviceInit()
{
	deviceExit();

	if (FFTAlgo) GPUAlgo = 2;

	gpumap = new bioem_RefMap;
	memcpy(gpumap, &RefMap, sizeof(bioem_RefMap));
	if (FFTAlgo == 0)
	{
		checkCudaErrors(cudaMalloc(&maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize));

		if (GPUAlgo == 0 || GPUAlgo == 1)
		{
			pRefMap_device_Mod = (bioem_RefMap_Mod*) gpumap;
			bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod;
			RefMapGPU->init(RefMap);
			checkCudaErrors(cudaMemcpy(maps, RefMapGPU->maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
			delete RefMapGPU;
		}
		else
		{
			checkCudaErrors(cudaMemcpy(maps, RefMap.maps, sizeof(myfloat_t) * RefMap.ntotRefMap * RefMap.refMapSize, cudaMemcpyHostToDevice));
		}
	}
	checkCudaErrors(cudaMalloc(&sum, sizeof(myfloat_t) * RefMap.ntotRefMap));
	checkCudaErrors(cudaMemcpy(sum, RefMap.sum_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMalloc(&sumsquare, sizeof(myfloat_t) * RefMap.ntotRefMap));
	checkCudaErrors(cudaMemcpy(sumsquare, RefMap.sumsquare_RefMap, sizeof(myfloat_t) * RefMap.ntotRefMap, cudaMemcpyHostToDevice));
	gpumap->maps = maps;
	gpumap->sum_RefMap = sum;
	gpumap->sumsquare_RefMap = sumsquare;

	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	for (int i = 0; i < 2; i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(myfloat_t) * RefMap.refMapSize));
	}

	if (FFTAlgo)
	{
		checkCudaErrors(cudaMalloc(&pRefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t)));
		checkCudaErrors(cudaMalloc(&pFFTtmp2, CUDA_FFTS_AT_ONCE * param.FFTMapSize * sizeof(mycomplex_t)));
		checkCudaErrors(cudaMalloc(&pFFTtmp, CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels * sizeof(myfloat_t)));
		checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
		checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
	}

	deviceInitialized = 1;
	return(0);
}

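//Free all device resources allocated in deviceInit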
int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);

	cudaStreamDestroy(cudaStream);
	cudaFree(pProb_device);
	cudaFree(sum);
	cudaFree(sumsquare);
	for (int i = 0; i < 2; i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		cudaFree(pConvMap_device[i]);
	}
	if (FFTAlgo)
	{
		cudaFree(pRefMapsFFT);
		cudaFree(pConvMapFFT);
		//cudaFree(pFFTtmp);
		cudaFree(pFFTtmp2);
	}
	else
	{
		cudaFree(maps);
	}
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		cudaFree(pRefMap_device_Mod);
	}
	delete gpumap;
	cudaThreadExit();

	deviceInitialized = 0;
	return(0);
}

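//Per-run setup: determine the GPU's share of the reference maps, upload the current
//probabilities, and create the batched CUFFT plans (plan[0] for full batches of
//CUDA_FFTS_AT_ONCE maps, plan[1] for the smaller remainder batch)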
int bioem_cuda::deviceStartRun()
{
	maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);

	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);

	if (FFTAlgo)
	{
		for (int i = 0; i < 2; i++)
		{
			int n[2] = {param.param_device.NumberPixels, param.param_device.NumberPixels};
			if (cufftPlanMany(&plan[i], 2, n, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, i ? (maxRef % CUDA_FFTS_AT_ONCE) : CUDA_FFTS_AT_ONCE) != CUFFT_SUCCESS)
			{
				cout << "Error planning CUFFT\n";
				exit(1);
			}
			if (cufftSetCompatibilityMode(plan[i], CUFFT_COMPATIBILITY_NATIVE) != CUFFT_SUCCESS)
			{
				cout << "Error planning CUFFT compatibility\n";
				exit(1);
			}
			if (cufftSetStream(plan[i], cudaStream) != CUFFT_SUCCESS)
			{
				cout << "Error setting CUFFT stream\n";
				exit(1);
			}
		}
	}
	return(0);
}

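//Per-run teardown: wait for outstanding GPU work, download the accumulated
//probabilities, and destroy the CUFFT plans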
int bioem_cuda::deviceFinishRun()
{
	if (GPUAsync) cudaStreamSynchronize(cudaStream);
	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);

	if (FFTAlgo)
	{
		for (int i = 0; i < 2; i++) cufftDestroy(plan[i]);
	}

	return(0);
}

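//Factory function: returns the CUDA implementation of the bioem interface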
bioem* bioem_cuda_create()
{
	return new bioem_cuda;
}