Commit 6532f7f3 authored by qon's avatar qon
Browse files

Add Option for asnychronous GPU processing, add test implementation with...

Add Option for asnychronous GPU processing, add test implementation with explicit SSE for CPU (currently disabled)
parent 92d5b2ba
...@@ -266,8 +266,15 @@ int bioem::run() ...@@ -266,8 +266,15 @@ int bioem::run()
/*** Comparing each calculated convoluted map with all experimental maps ***/ /*** Comparing each calculated convoluted map with all experimental maps ***/
timer.ResetStart(); timer.ResetStart();
compareRefMaps(iProjectionOut, iConv, conv_map); compareRefMaps(iProjectionOut, iConv, conv_map);
printf("Time Comparison %d %d: %f\n", iProjectionOut, iConv, timer.GetCurrentElapsedTime()); const double compTime = timer.GetCurrentElapsedTime();
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
printf("Time Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s)\n", iProjectionOut, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000.);
} }
} }
......
#ifndef BIOEM_ALGORITHM_H #ifndef BIOEM_ALGORITHM_H
#define BIOEM_ALGORITHM_H #define BIOEM_ALGORITHM_H
//#include <boost/iterator/iterator_concepts.hpp>
#ifndef BIOEM_GPUCODE
//#define SSECODE //Explicit SSE code, not correct yet since loop counter is assumed multiple of 4, anyway not faster than autovectorized code.
#endif
#ifdef SSECODE
#include <emmintrin.h>
#include <smmintrin.h>
#endif
template <int GPUAlgo, class RefT> template <int GPUAlgo, class RefT>
__device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap, __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
...@@ -12,10 +22,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -12,10 +22,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
// Taking into account the center displacement // Taking into account the center displacement
/*** Inizialzing crosscorrelations of calculated projected convolutions ***/ /*** Inizialzing crosscorrelations of calculated projected convolutions ***/
myfloat_t sum=0.0; #ifdef SSECODE
myfloat_t sum, sumsquare, crossproMapConv;
__m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2;
#else
myfloat_t sum=0.0;
myfloat_t sumsquare=0.0; myfloat_t sumsquare=0.0;
myfloat_t crossproMapConv=0.0; myfloat_t crossproMapConv=0.0;
#endif
/****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/ /****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/
if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap) if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap)
{ {
...@@ -43,9 +57,25 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -43,9 +57,25 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
for (int i = iStart; i < iEnd; i += 1) for (int i = iStart; i < iEnd; i += 1)
{ {
#ifdef SSECODE
const float* ptr1 = &Mapconv.points[i + cent_x][jStart + cent_y];
const float* ptr2 = RefMap.getp(iRefMap, i, jStart);
int j;
const int count = jEnd - jStart;
for (j = 0;j <= count - 4;j += 4)
{
d1 = _mm_loadu_ps(ptr1);
d2 = _mm_loadu_ps(ptr2);
sum_v = _mm_add_ps(sum_v, d1);
sumsquare_v = _mm_add_ps(sumsquare_v, _mm_mul_ps(d1, d1));
cross_v = _mm_add_ps(cross_v, _mm_mul_ps(d1, d2));
ptr1 += 4;
ptr2 += 4;
}
#else
for (int j = jStart; j < jEnd; j += 1) for (int j = jStart; j < jEnd; j += 1)
{ {
const myfloat_t pointMap = Mapconv.points[i+cent_x][j+cent_y]; const myfloat_t pointMap = Mapconv.points[i + cent_x][j + cent_y];
const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j); const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
crossproMapConv += pointMap * pointRefMap; crossproMapConv += pointMap * pointRefMap;
// Crosscorrelation of calculated displaced map // Crosscorrelation of calculated displaced map
...@@ -53,7 +83,19 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient ...@@ -53,7 +83,19 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
// Calculate Sum of pixels squared // Calculate Sum of pixels squared
sumsquare += pointMap*pointMap; sumsquare += pointMap*pointMap;
} }
#endif
} }
#ifdef SSECODE
sum_v = _mm_hadd_ps(sum_v, sum_v);
sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
cross_v = _mm_hadd_ps(cross_v, cross_v);
sum_v = _mm_hadd_ps(sum_v, sum_v);
sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
cross_v = _mm_hadd_ps(cross_v, cross_v);
sum = _mm_cvtss_f32(sum_v);
sumsquare = _mm_cvtss_f32(sumsquare_v);
crossproMapConv = _mm_cvtss_f32(cross_v);
#endif
} }
/********** Calculating elements in BioEM Probability formula ********/ /********** Calculating elements in BioEM Probability formula ********/
......
#define BIOEM_GPUCODE #define BIOEM_GPUCODE
#if defined(_WIN32) #if defined(_WIN32)
#include <windows.h> #include <windows.h>
#endif #endif
#include <iostream> #include <iostream>
using namespace std; using namespace std;
#include "bioem_cuda_internal.h" #include "bioem_cuda_internal.h"
#include "bioem_algorithm.h" #include "bioem_algorithm.h"
//__constant__ bioem_map gConvMap; //__constant__ bioem_map gConvMap;
static inline void checkCudaErrors(cudaError_t error) static inline void checkCudaErrors(cudaError_t error)
{ {
if (error != cudaSuccess) if (error != cudaSuccess)
{ {
printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error)); printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
exit(1); exit(1);
} }
} }
bioem_cuda::bioem_cuda() bioem_cuda::bioem_cuda()
{ {
deviceInitialized = 0; deviceInitialized = 0;
GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO")); GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
} GPUAsync = getenv("GPUASYNC") == NULL ? 0 : atoi(getenv("GPUASYNC"));
}
bioem_cuda::~bioem_cuda()
{ bioem_cuda::~bioem_cuda()
deviceExit(); {
} deviceExit();
}
__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
{ __global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; {
if (iRefMap < RefMap->ntotRefMap) const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
{ if (iRefMap < RefMap->ntotRefMap)
compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y); {
} compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
} }
}
__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
{ __global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX; {
if (iRefMap < RefMap->ntotRefMap) const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
{ if (iRefMap < RefMap->ntotRefMap)
compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap); {
} compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
} }
}
__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
{ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX; {
const int iRefMap = myid >> (nShiftBits << 1); const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
const int myRef = myThreadIdxX >> (nShiftBits << 1); const int iRefMap = myid >> (nShiftBits << 1);
const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1); const int myRef = myThreadIdxX >> (nShiftBits << 1);
const int myShiftIdy = myid & (nShifts - 1); const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
const int myShift = myid & (nShifts * nShifts - 1); const int myShiftIdy = myid & (nShifts - 1);
const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter; const int myShift = myid & (nShifts * nShifts - 1);
const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter; const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
} compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
}
template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));} template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
#if defined(_WIN32) #if defined(_WIN32)
static inline int ilog2 (int value) static inline int ilog2 (int value)
{ {
DWORD index; DWORD index;
_BitScanReverse (&index, value); _BitScanReverse (&index, value);
return(value); return(value);
} }
#else #else
static inline int ilog2(int value) {return 31 - __builtin_clz(value);} static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
#endif #endif
int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map) int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
{ {
//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream); //cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
checkCudaErrors(cudaMemcpyAsync(pConvMap_device, &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream)); if (GPUAsync)
{
if (GPUAlgo == 2) //Loop over shifts checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
{ checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1; }
if (!IsPowerOf2(nShifts)) checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
{
cout << "Invalid number of displacements, no power of two\n"; if (GPUAlgo == 2) //Loop over shifts
exit(1); {
} const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
if (CUDA_THREAD_COUNT % (nShifts * nShifts)) if (!IsPowerOf2(nShifts))
{ {
cout << "CUDA Thread count is no multiple of number of shifts\n"; cout << "Invalid number of displacements, no power of two\n";
exit(1); exit(1);
} }
if (nShifts > CUDA_MAX_SHIFT_REDUCE) if (CUDA_THREAD_COUNT % (nShifts * nShifts))
{ {
cout << "Too many displacements for CUDA reduction\n"; cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
exit(1); exit(1);
} }
const int nShiftBits = ilog2(nShifts); if (nShifts > CUDA_MAX_SHIFT_REDUCE)
size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT); {
size_t nBlocks = CUDA_BLOCK_COUNT; cout << "Too many displacements for CUDA reduction\n";
for (size_t i = 0;i < totalBlocks;i += nBlocks) exit(1);
{ }
compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits); const int nShiftBits = ilog2(nShifts);
} size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
} size_t nBlocks = CUDA_BLOCK_COUNT;
else if (GPUAlgo == 1) //Split shifts in multiple kernels for (size_t i = 0;i < totalBlocks;i += nBlocks)
{ {
for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter) compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
{ }
for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter) }
{ else if (GPUAlgo == 1) //Split shifts in multiple kernels
compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y); {
} for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
} {
} for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
else if (GPUAlgo == 0) //All shifts in one kernel {
{ compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod); }
} }
else }
{ else if (GPUAlgo == 0) //All shifts in one kernel
cout << "Invalid GPU Algorithm selected\n"; {
exit(1); compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod);
} }
checkCudaErrors(cudaStreamSynchronize(cudaStream)); else
return(0); {
} cout << "Invalid GPU Algorithm selected\n";
exit(1);
int bioem_cuda::deviceInit() }
{ if (GPUAsync == 0) checkCudaErrors(cudaStreamSynchronize(cudaStream));
deviceExit(); return(0);
}
checkCudaErrors(cudaStreamCreate(&cudaStream));
checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap))); int bioem_cuda::deviceInit()
checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap)); {
checkCudaErrors(cudaMalloc(&pConvMap_device, sizeof(bioem_map))); deviceExit();
pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
checkCudaErrors(cudaStreamCreate(&cudaStream));
if (GPUAlgo == 0 || GPUAlgo == 1) checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
{ checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap); for (int i = 0;i < 2;i++)
cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice); {
delete RefMapGPU; checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
} checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
else }
{ pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
} if (GPUAlgo == 0 || GPUAlgo == 1)
deviceInitialized = 1; {
return(0); bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
} cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
delete RefMapGPU;
int bioem_cuda::deviceExit() }
{ else
if (deviceInitialized == 0) return(0); {
cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
cudaStreamDestroy(cudaStream); }
cudaFree(pRefMap_device); deviceInitialized = 1;
cudaFree(pProb_device); return(0);
cudaFree(pConvMap_device); }
cudaThreadExit();
int bioem_cuda::deviceExit()
deviceInitialized = 0; {
return(0); if (deviceInitialized == 0) return(0);
}
cudaStreamDestroy(cudaStream);
int bioem_cuda::deviceStartRun() cudaFree(pRefMap_device);
{ cudaFree(pProb_device);
for (int i = 0;i < 2;i++)
cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice); {
return(0); cudaEventDestroy(cudaEvent[i]);
} cudaFree(pConvMap_device);
}
int bioem_cuda::deviceFinishRun() cudaThreadExit();
{
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost); deviceInitialized = 0;
return(0);
return(0); }
}
int bioem_cuda::deviceStartRun()
bioem* bioem_cuda_create() {
{
return new bioem_cuda; cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice);
} return(0);
}
int bioem_cuda::deviceFinishRun()
{
if (GPUAsync) cudaStreamSynchronize(cudaStream);
cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost);
return(0);
}
bioem* bioem_cuda_create()
{
return new bioem_cuda;
}
...@@ -22,12 +22,14 @@ protected: ...@@ -22,12 +22,14 @@ protected:
int deviceInitialized; int deviceInitialized;
cudaStream_t cudaStream; cudaStream_t cudaStream;
cudaEvent_t cudaEvent[2];
bioem_RefMap_Mod* pRefMap_device_Mod; bioem_RefMap_Mod* pRefMap_device_Mod;
bioem_RefMap* pRefMap_device; bioem_RefMap* pRefMap_device;
bioem_Probability* pProb_device; bioem_Probability* pProb_device;
bioem_map* pConvMap_device; bioem_map* pConvMap_device[2];
int GPUAlgo; int GPUAlgo;
int GPUAsync;
}; };
#endif #endif
...@@ -35,6 +35,7 @@ public: ...@@ -35,6 +35,7 @@ public:
bool dumpMap, loadMap; bool dumpMap, loadMap;
__host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(Ref[map].points[x][y]);} __host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(Ref[map].points[x][y]);}
__host__ __device__ inline const myfloat_t* getp(int map, int x, int y) const {return(&Ref[map].points[x][y]);}
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment