bioem_cuda.cu 7.31 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

//__constant__ bioem_map gConvMap;

static inline void checkCudaErrors(cudaError_t error)
{
	if (error != cudaSuccess)
	{
		printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
		exit(1);
	}
}

bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
28
29
	GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
	GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
30
31
32
33
34
35
36
}

bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

37
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
38
39
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
40
	if (iRefMap < maxRef)
41
42
43
44
45
	{
		compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
	}
}

46
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
47
48
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
49
	if (iRefMap < maxRef)
50
51
52
53
54
	{
		compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
	}
}

55
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
56
57
58
59
60
61
62
63
64
65
{
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	const int iRefMap = myid >> (nShiftBits << 1);
	const int myRef = myThreadIdxX >> (nShiftBits << 1);
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
	const int myShiftIdy = myid & (nShifts - 1);
	const int myShift = myid & (nShifts * nShifts - 1);
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
	
66
67
68
	const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
	
	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
}

template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	return(value);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

84
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, const int startMap)
85
{
86
87
88
89
90
	if (startMap)
	{
		cout << "Error startMap not implemented for GPU Code\n";
		exit(1);
	}
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}
	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
	
	if (GPUAlgo == 2) //Loop over shifts
	{
		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
		if (!IsPowerOf2(nShifts))
		{
			cout << "Invalid number of displacements, no power of two\n";
			exit(1);
		}
		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
		{
			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
			exit(1);
		}
		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
		{
			cout << "Too many displacements for CUDA reduction\n";
			exit(1);
		}
		const int nShiftBits = ilog2(nShifts);
117
		size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
118
119
120
		size_t nBlocks = CUDA_BLOCK_COUNT;
		for (size_t i = 0;i < totalBlocks;i += nBlocks)
		{
121
			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
122
123
124
125
126
127
128
129
		}
	}
	else if (GPUAlgo == 1) //Split shifts in multiple kernels
	{
		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
		{
			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
			{
130
				compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
131
132
133
134
135
			}
		}	
	}
	else if (GPUAlgo == 0) //All shifts in one kernel
	{
136
		compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
137
138
139
140
141
142
	}
	else
	{
		cout << "Invalid GPU Algorithm selected\n";
		exit(1);
	}
143
144
145
146
	if (GPUWorkload < 100)
	{
		bioem::compareRefMaps(iProjectionOut, iConv, conv_map, maxRef);
	}
147
148
149
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
150
	}
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

int bioem_cuda::deviceInit()
{
	deviceExit();
	
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	for (int i = 0;i < 2;i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
	}
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
	
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
		delete RefMapGPU;
	}
	else
	{
		cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
	}
	deviceInitialized = 1;
	return(0);
}

int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);
	
	cudaStreamDestroy(cudaStream);
	cudaFree(pRefMap_device);
	cudaFree(pProb_device);
	for (int i = 0;i < 2;i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		cudaFree(pConvMap_device);
	}
	cudaThreadExit();
	
	deviceInitialized = 0;
	return(0);
}

int bioem_cuda::deviceStartRun()
{
206
	maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
207
	
208
	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
209
210
211
212
213
214
	return(0);
}

int bioem_cuda::deviceFinishRun()
{
	if (GPUAsync) cudaStreamSynchronize(cudaStream);
215
	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
216
217
218
219
220
221
222
223
	
	return(0);
}

bioem* bioem_cuda_create()
{
	return new bioem_cuda;
}