bioem_cuda.cu 6.82 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

//__constant__ bioem_map gConvMap;

static inline void checkCudaErrors(cudaError_t error)
{
	if (error != cudaSuccess)
	{
		printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
		exit(1);
	}
}

bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
	GPUAsync = getenv("GPUASYNC") == NULL ? 0 : atoi(getenv("GPUASYNC"));
}

bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap < RefMap->ntotRefMap)
	{
		compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
	}
}

__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
	if (iRefMap < RefMap->ntotRefMap)
	{
		compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
	}
}

__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
{
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	const int iRefMap = myid >> (nShiftBits << 1);
	const int myRef = myThreadIdxX >> (nShiftBits << 1);
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
	const int myShiftIdy = myid & (nShifts - 1);
	const int myShift = myid & (nShifts * nShifts - 1);
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
	
	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
}

template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	return(value);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
{
	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}
	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
	
	if (GPUAlgo == 2) //Loop over shifts
	{
		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
		if (!IsPowerOf2(nShifts))
		{
			cout << "Invalid number of displacements, no power of two\n";
			exit(1);
		}
		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
		{
			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
			exit(1);
		}
		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
		{
			cout << "Too many displacements for CUDA reduction\n";
			exit(1);
		}
		const int nShiftBits = ilog2(nShifts);
		size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
		size_t nBlocks = CUDA_BLOCK_COUNT;
		for (size_t i = 0;i < totalBlocks;i += nBlocks)
		{
			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
		}
	}
	else if (GPUAlgo == 1) //Split shifts in multiple kernels
	{
		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
		{
			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
			{
				compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
			}
		}	
	}
	else if (GPUAlgo == 0) //All shifts in one kernel
	{
		compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod);
	}
	else
	{
		cout << "Invalid GPU Algorithm selected\n";
		exit(1);
	}
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
		}
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

int bioem_cuda::deviceInit()
{
	deviceExit();
	
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	for (int i = 0;i < 2;i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
	}
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
	
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
		delete RefMapGPU;
	}
	else
	{
		cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
	}
	deviceInitialized = 1;
	return(0);
}

int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);
	
	cudaStreamDestroy(cudaStream);
	cudaFree(pRefMap_device);
	cudaFree(pProb_device);
	for (int i = 0;i < 2;i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		cudaFree(pConvMap_device);
	}
	cudaThreadExit();
	
	deviceInitialized = 0;
	return(0);
}

int bioem_cuda::deviceStartRun()
{
	
	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice);
	return(0);
}

int bioem_cuda::deviceFinishRun()
{
	if (GPUAsync) cudaStreamSynchronize(cudaStream);
	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost);
	
	return(0);
}

bioem* bioem_cuda_create()
{
	return new bioem_cuda;
}