bioem_cuda.cu 7.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#define BIOEM_GPUCODE

#if defined(_WIN32)
#include <windows.h>
#endif

#include <iostream>
using namespace std;

#include "bioem_cuda_internal.h"
#include "bioem_algorithm.h"

//__constant__ bioem_map gConvMap;

static inline void checkCudaErrors(cudaError_t error)
{
	if (error != cudaSuccess)
	{
		printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
		exit(1);
	}
}

bioem_cuda::bioem_cuda()
{
	deviceInitialized = 0;
	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
28
29
	GPUAsync = getenv("GPUASYNC") == NULL ? 1 : atoi(getenv("GPUASYNC"));
	GPUWorkload = getenv("GPUWORKLOAD") == NULL ? 100 : atoi(getenv("GPUWORKLOAD"));
30
31
32
33
34
35
36
}

bioem_cuda::~bioem_cuda()
{
	deviceExit();
}

37
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y, const int maxRef)
38
39
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
40
	if (iRefMap < maxRef)
41
42
43
44
45
	{
		compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
	}
}

46
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int maxRef)
47
48
{
	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
49
	if (iRefMap < maxRef)
50
51
52
53
54
	{
		compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
	}
}

55
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits, const int maxRef)
56
57
58
59
60
61
62
63
64
{
	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
	const int iRefMap = myid >> (nShiftBits << 1);
	const int myRef = myThreadIdxX >> (nShiftBits << 1);
	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
	const int myShiftIdy = myid & (nShifts - 1);
	const int myShift = myid & (nShifts * nShifts - 1);
	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
65

66
	const bool threadActive = myShiftIdx < nShifts && myShiftIdy < nShifts && iRefMap < maxRef;
67

68
	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef, threadActive);
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
}

template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32)
static inline int ilog2 (int value)
{
	DWORD index;
	_BitScanReverse (&index, value);
	return(value);
}
#else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif

84
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap)
85
{
86
87
88
89
90
	if (startMap)
	{
		cout << "Error startMap not implemented for GPU Code\n";
		exit(1);
	}
91
92
93
94
95
96
	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
	}
	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
97

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
	if (GPUAlgo == 2) //Loop over shifts
	{
		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
		if (!IsPowerOf2(nShifts))
		{
			cout << "Invalid number of displacements, no power of two\n";
			exit(1);
		}
		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
		{
			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
			exit(1);
		}
		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
		{
			cout << "Too many displacements for CUDA reduction\n";
			exit(1);
		}
		const int nShiftBits = ilog2(nShifts);
117
		size_t totalBlocks = divup((size_t) maxRef * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
118
119
120
		size_t nBlocks = CUDA_BLOCK_COUNT;
		for (size_t i = 0;i < totalBlocks;i += nBlocks)
		{
121
			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits, maxRef);
122
123
124
125
126
127
128
129
		}
	}
	else if (GPUAlgo == 1) //Split shifts in multiple kernels
	{
		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
		{
			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
			{
130
				compareRefMap_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y, maxRef);
131
			}
132
		}
133
134
135
	}
	else if (GPUAlgo == 0) //All shifts in one kernel
	{
136
		compareRefMapShifted_kernel <<<divup(maxRef, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, maxRef);
137
138
139
140
141
142
	}
	else
	{
		cout << "Invalid GPU Algorithm selected\n";
		exit(1);
	}
143
144
	if (GPUWorkload < 100)
	{
145
		bioem::compareRefMaps(iProjectionOut, iConv, conv_map, localmultFFT, sumC, sumsquareC, maxRef);
146
	}
147
148
149
	if (GPUAsync)
	{
		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
150
	}
151
152
153
154
155
156
157
158
159
160
	else
	{
		checkCudaErrors(cudaStreamSynchronize(cudaStream));
	}
	return(0);
}

int bioem_cuda::deviceInit()
{
	deviceExit();
161

162
163
164
165
166
167
168
169
170
	checkCudaErrors(cudaStreamCreate(&cudaStream));
	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
	for (int i = 0;i < 2;i++)
	{
		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
	}
	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
171

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
	if (GPUAlgo == 0 || GPUAlgo == 1)
	{
		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
		cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
		delete RefMapGPU;
	}
	else
	{
		cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
	}
	deviceInitialized = 1;
	return(0);
}

int bioem_cuda::deviceExit()
{
	if (deviceInitialized == 0) return(0);
189

190
191
192
193
194
195
196
197
198
	cudaStreamDestroy(cudaStream);
	cudaFree(pRefMap_device);
	cudaFree(pProb_device);
	for (int i = 0;i < 2;i++)
	{
		cudaEventDestroy(cudaEvent[i]);
		cudaFree(pConvMap_device);
	}
	cudaThreadExit();
199

200
201
202
203
204
205
	deviceInitialized = 0;
	return(0);
}

int bioem_cuda::deviceStartRun()
{
206
	maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
207

208
	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * maxRef, cudaMemcpyHostToDevice);
209
210
211
212
213
214
	return(0);
}

int bioem_cuda::deviceFinishRun()
{
	if (GPUAsync) cudaStreamSynchronize(cudaStream);
215
	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * maxRef, cudaMemcpyDeviceToHost);
216

217
218
219
220
221
222
223
	return(0);
}

bioem* bioem_cuda_create()
{
	return new bioem_cuda;
}