From 6532f7f3ac3cc8ff92b954f1330c0761dc9dd0ad Mon Sep 17 00:00:00 2001
From: qon <qon@jwdt.org>
Date: Sun, 6 Apr 2014 22:17:03 +0200
Subject: [PATCH] Add Option for asnychronous GPU processing, add test
 implementation with explicit SSE for CPU (currently disabled)

---
 bioem.cpp                     |  11 +-
 bioem_algorithm.h             |  48 ++++-
 bioem_cuda.cu                 | 377 ++++++++++++++++++----------------
 include/bioem_cuda_internal.h |   4 +-
 include/map.h                 |   1 +
 5 files changed, 254 insertions(+), 187 deletions(-)

diff --git a/bioem.cpp b/bioem.cpp
index b917bb9..a552886 100644
--- a/bioem.cpp
+++ b/bioem.cpp
@@ -266,8 +266,15 @@ int bioem::run()
             /*** Comparing each calculated convoluted map with all experimental maps ***/
 			timer.ResetStart();
 			compareRefMaps(iProjectionOut, iConv, conv_map);
-			printf("Time Comparison %d %d: %f\n", iProjectionOut, iConv, timer.GetCurrentElapsedTime());
-
+			const double compTime = timer.GetCurrentElapsedTime();
+			const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
+			const double nFlops = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
+				(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 5. + 25.) / compTime;
+			const double nGBs = (double) RefMap.ntotRefMap * (double) nShifts * (double) nShifts *
+				(((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * ((double) param.param_device.NumberPixels - (double) param.param_device.maxDisplaceCenter / 2.) * 2. + 8.) * (double) sizeof(myfloat_t) / compTime;
+			const double nGBs2 = (double) RefMap.ntotRefMap * ((double) param.param_device.NumberPixels * (double) param.param_device.NumberPixels + 8.) * (double) sizeof(myfloat_t) / compTime;
+
+			printf("Time Comparison %d %d: %f sec (%f GFlops, %f GB/s (cached), %f GB/s)\n", iProjectionOut, iConv, compTime, nFlops / 1000000000., nGBs / 1000000000., nGBs2 / 1000000000.);
         }
 
     }
diff --git a/bioem_algorithm.h b/bioem_algorithm.h
index 43b5270..0ecf852 100644
--- a/bioem_algorithm.h
+++ b/bioem_algorithm.h
@@ -1,5 +1,15 @@
 #ifndef BIOEM_ALGORITHM_H
 #define BIOEM_ALGORITHM_H
+//#include <boost/iterator/iterator_concepts.hpp>
+
+#ifndef BIOEM_GPUCODE
+//#define SSECODE //Explicit SSE code, not correct yet since loop counter is assumed multiple of 4, anyway not faster than autovectorized code.
+#endif
+
+#ifdef SSECODE
+#include <emmintrin.h>
+#include <smmintrin.h>
+#endif
 
 template <int GPUAlgo, class RefT>
 __device__ static inline void compareRefMap(const int iRefMap, const int iOrient, const int iConv, const bioem_map& Mapconv, bioem_Probability* pProb, const bioem_param_device& param, const RefT& RefMap,
@@ -12,10 +22,14 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
     // Taking into account the center displacement
 
     /*** Inizialzing crosscorrelations of calculated projected convolutions ***/
-    myfloat_t sum=0.0;
+#ifdef SSECODE
+	myfloat_t sum, sumsquare, crossproMapConv;
+	__m128 sum_v = _mm_setzero_ps(), sumsquare_v = _mm_setzero_ps(), cross_v = _mm_setzero_ps(), d1, d2;
+#else
+	myfloat_t sum=0.0;
     myfloat_t sumsquare=0.0;
     myfloat_t crossproMapConv=0.0;
-
+#endif
     /****** Loop over Pixels to calculate dot product and cross-correlations of displaced Ref Conv. Map***/
 	if (GPUAlgo != 2 || iRefMap < RefMap.ntotRefMap) 
 	{
@@ -43,9 +57,25 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
 
 		for (int i = iStart; i < iEnd; i += 1)
 		{
+#ifdef SSECODE
+			const float* ptr1 = &Mapconv.points[i + cent_x][jStart + cent_y];
+			const float* ptr2 = RefMap.getp(iRefMap, i, jStart);
+			int j;
+			const int count = jEnd - jStart;
+			for (j = 0;j <= count - 4;j += 4)
+			{
+				d1 = _mm_loadu_ps(ptr1);
+				d2 = _mm_loadu_ps(ptr2);
+				sum_v = _mm_add_ps(sum_v, d1);
+				sumsquare_v = _mm_add_ps(sumsquare_v, _mm_mul_ps(d1, d1));
+				cross_v = _mm_add_ps(cross_v, _mm_mul_ps(d1, d2));
+				ptr1 += 4;
+				ptr2 += 4;
+			}
+#else
 			for (int j = jStart; j < jEnd; j += 1)
 			{
-				const myfloat_t pointMap = Mapconv.points[i+cent_x][j+cent_y];
+				const myfloat_t pointMap = Mapconv.points[i + cent_x][j + cent_y];
 				const myfloat_t pointRefMap = RefMap.get(iRefMap, i, j);
 				crossproMapConv += pointMap * pointRefMap;
 				// Crosscorrelation of calculated displaced map
@@ -53,7 +83,19 @@ __device__ static inline void compareRefMap(const int iRefMap, const int iOrient
 				// Calculate Sum of pixels squared
 				sumsquare += pointMap*pointMap;
 			}
+#endif
 		}
+#ifdef SSECODE
+		sum_v = _mm_hadd_ps(sum_v, sum_v);
+		sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
+		cross_v = _mm_hadd_ps(cross_v, cross_v);
+		sum_v = _mm_hadd_ps(sum_v, sum_v);
+		sumsquare_v = _mm_hadd_ps(sumsquare_v, sumsquare_v);
+		cross_v = _mm_hadd_ps(cross_v, cross_v);
+		sum = _mm_cvtss_f32(sum_v);
+		sumsquare = _mm_cvtss_f32(sumsquare_v);
+		crossproMapConv = _mm_cvtss_f32(cross_v);
+#endif
 	}
 	
     /********** Calculating elements in BioEM Probability formula ********/
diff --git a/bioem_cuda.cu b/bioem_cuda.cu
index 8ba1cb1..e890c62 100644
--- a/bioem_cuda.cu
+++ b/bioem_cuda.cu
@@ -1,189 +1,204 @@
-#define BIOEM_GPUCODE
-
+#define BIOEM_GPUCODE
+
 #if defined(_WIN32)
 #include <windows.h>
-#endif
-
-#include <iostream>
-using namespace std;
-
-#include "bioem_cuda_internal.h"
-#include "bioem_algorithm.h"
-
-//__constant__ bioem_map gConvMap;
-
-static inline void checkCudaErrors(cudaError_t error)
-{
-	if (error != cudaSuccess)
-	{
-		printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
-		exit(1);
-	}
-}
-
-bioem_cuda::bioem_cuda()
-{
-	deviceInitialized = 0;
-	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
-}
-
-bioem_cuda::~bioem_cuda()
-{
-	deviceExit();
-}
-
-__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
-{
-	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
-	if (iRefMap < RefMap->ntotRefMap)
-	{
-		compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
-	}
-}
-
-__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
-{
-	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
-	if (iRefMap < RefMap->ntotRefMap)
-	{
-		compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
-	}
-}
-
-__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
-{
-	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
-	const int iRefMap = myid >> (nShiftBits << 1);
-	const int myRef = myThreadIdxX >> (nShiftBits << 1);
-	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
-	const int myShiftIdy = myid & (nShifts - 1);
-	const int myShift = myid & (nShifts * nShifts - 1);
-	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
-	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
-	
-	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
-}
-
-template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
-static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
+#endif
+
+#include <iostream>
+using namespace std;
+
+#include "bioem_cuda_internal.h"
+#include "bioem_algorithm.h"
+
+//__constant__ bioem_map gConvMap;
+
+static inline void checkCudaErrors(cudaError_t error)
+{
+	if (error != cudaSuccess)
+	{
+		printf("CUDA Error %d / %s\n", error, cudaGetErrorString(error));
+		exit(1);
+	}
+}
+
+bioem_cuda::bioem_cuda()
+{
+	deviceInitialized = 0;
+	GPUAlgo = getenv("GPUALGO") == NULL ? 2 : atoi(getenv("GPUALGO"));
+	GPUAsync = getenv("GPUASYNC") == NULL ? 0 : atoi(getenv("GPUASYNC"));
+}
+
+bioem_cuda::~bioem_cuda()
+{
+	deviceExit();
+}
+
+__global__ void compareRefMap_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap, const int cent_x, const int cent_y)
+{
+	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
+	if (iRefMap < RefMap->ntotRefMap)
+	{
+		compareRefMap<0>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y);
+	}
+}
+
+__global__ void compareRefMapShifted_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap_Mod* RefMap)
+{
+	const int iRefMap = myBlockIdxX * myBlockDimX + myThreadIdxX;
+	if (iRefMap < RefMap->ntotRefMap)
+	{
+		compareRefMapShifted<1>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap);
+	}
+}
+
+__global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iConv, const bioem_map* pMap, bioem_Probability* pProb, const bioem_param_device param, const bioem_RefMap* RefMap, const int blockoffset, const int nShifts, const int nShiftBits)
+{
+	const size_t myid = (myBlockIdxX + blockoffset) * myBlockDimX + myThreadIdxX;
+	const int iRefMap = myid >> (nShiftBits << 1);
+	const int myRef = myThreadIdxX >> (nShiftBits << 1);
+	const int myShiftIdx = (myid >> nShiftBits) & (nShifts - 1);
+	const int myShiftIdy = myid & (nShifts - 1);
+	const int myShift = myid & (nShifts * nShifts - 1);
+	const int cent_x = myShiftIdx * param.GridSpaceCenter - param.maxDisplaceCenter;
+	const int cent_y = myShiftIdy * param.GridSpaceCenter - param.maxDisplaceCenter;
+	
+	compareRefMap<2>(iRefMap,iOrient,iConv,*pMap, pProb, param, *RefMap, cent_x, cent_y, myShift, nShifts * nShifts, myRef);
+}
+
+template <class T> static inline T divup(T num, T divider) {return((num + divider - 1) / divider);}
+static inline bool IsPowerOf2(int x) {return ((x > 0) && ((x & (x - 1)) == 0));}
 #if defined(_WIN32)
 static inline int ilog2 (int value)
 {
 	DWORD index;
 	_BitScanReverse (&index, value);
 	return(value);
-}
-#else
-static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
-#endif
-
-int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
-{
-	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
-	checkCudaErrors(cudaMemcpyAsync(pConvMap_device, &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
-	
-	if (GPUAlgo == 2) //Loop over shifts
-	{
-		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
-		if (!IsPowerOf2(nShifts))
-		{
-			cout << "Invalid number of displacements, no power of two\n";
-			exit(1);
-		}
-		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
-		{
-			cout << "CUDA Thread count is no multiple of number of shifts\n";
-			exit(1);
-		}
-		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
-		{
-			cout << "Too many displacements for CUDA reduction\n";
-			exit(1);
-		}
-		const int nShiftBits = ilog2(nShifts);
-		size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
-		size_t nBlocks = CUDA_BLOCK_COUNT;
-		for (size_t i = 0;i < totalBlocks;i += nBlocks)
-		{
-			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
-		}
-	}
-	else if (GPUAlgo == 1) //Split shifts in multiple kernels
-	{
-		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
-		{
-			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
-			{
-				compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
-			}
-		}	
-	}
-	else if (GPUAlgo == 0) //All shifts in one kernel
-	{
-		compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device, pProb_device, param.param_device, pRefMap_device_Mod);
-	}
-	else
-	{
-		cout << "Invalid GPU Algorithm selected\n";
-		exit(1);
-	}
-	checkCudaErrors(cudaStreamSynchronize(cudaStream));
-	return(0);
-}
-
-int bioem_cuda::deviceInit()
-{
-	deviceExit();
-	
-	checkCudaErrors(cudaStreamCreate(&cudaStream));
-	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
-	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
-	checkCudaErrors(cudaMalloc(&pConvMap_device, sizeof(bioem_map)));
-	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
-	
-	if (GPUAlgo == 0 || GPUAlgo == 1)
-	{
-		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
-		cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
-		delete RefMapGPU;
-	}
-	else
-	{
-		cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
-	}
-	deviceInitialized = 1;
-	return(0);
-}
-
-int bioem_cuda::deviceExit()
-{
-	if (deviceInitialized == 0) return(0);
-	
-	cudaStreamDestroy(cudaStream);
-	cudaFree(pRefMap_device);
-	cudaFree(pProb_device);
-	cudaFree(pConvMap_device);
-	cudaThreadExit();
-	
-	deviceInitialized = 0;
-	return(0);
-}
-
-int bioem_cuda::deviceStartRun()
-{
-	
-	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice);
-	return(0);
-}
-
-int bioem_cuda::deviceFinishRun()
-{
-	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost);
-	
-	return(0);
-}
-
-bioem* bioem_cuda_create()
-{
-	return new bioem_cuda;
-}
+}
+#else
+static inline int ilog2(int value) {return 31 - __builtin_clz(value);}
+#endif
+
+int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& conv_map)
+{
+	//cudaMemcpyToSymbolAsync(&gConvMap, &conv_map, sizeof(bioem_map), 0, cudaMemcpyHostToDevice, cudaStream);
+	if (GPUAsync)
+	{
+		checkCudaErrors(cudaEventSynchronize(cudaEvent[iConv & 1]));
+		checkCudaErrors(cudaEventRecord(cudaEvent[iConv & 1], cudaStream));
+	}
+	checkCudaErrors(cudaMemcpyAsync(pConvMap_device[iConv & 1], &conv_map, sizeof(bioem_map), cudaMemcpyHostToDevice, cudaStream));
+	
+	if (GPUAlgo == 2) //Loop over shifts
+	{
+		const int nShifts = 2 * param.param_device.maxDisplaceCenter / param.param_device.GridSpaceCenter + 1;
+		if (!IsPowerOf2(nShifts))
+		{
+			cout << "Invalid number of displacements, no power of two\n";
+			exit(1);
+		}
+		if (CUDA_THREAD_COUNT % (nShifts * nShifts))
+		{
+			cout << "CUDA Thread count (" << CUDA_THREAD_COUNT << ") is no multiple of number of shifts (" << (nShifts * nShifts) << ")\n";
+			exit(1);
+		}
+		if (nShifts > CUDA_MAX_SHIFT_REDUCE)
+		{
+			cout << "Too many displacements for CUDA reduction\n";
+			exit(1);
+		}
+		const int nShiftBits = ilog2(nShifts);
+		size_t totalBlocks = divup((size_t) RefMap.ntotRefMap * (size_t) nShifts * (size_t) nShifts, (size_t) CUDA_THREAD_COUNT);
+		size_t nBlocks = CUDA_BLOCK_COUNT;
+		for (size_t i = 0;i < totalBlocks;i += nBlocks)
+		{
+			compareRefMapLoopShifts_kernel <<<min(nBlocks, totalBlocks - i), CUDA_THREAD_COUNT, (CUDA_THREAD_COUNT * 2 + CUDA_THREAD_COUNT / (nShifts * nShifts) * 4) * sizeof(myfloat_t), cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device, i, nShifts, nShiftBits);
+		}
+	}
+	else if (GPUAlgo == 1) //Split shifts in multiple kernels
+	{
+		for (int cent_x = -param.param_device.maxDisplaceCenter; cent_x <= param.param_device.maxDisplaceCenter; cent_x=cent_x+param.param_device.GridSpaceCenter)
+		{
+			for (int cent_y = -param.param_device.maxDisplaceCenter; cent_y <= param.param_device.maxDisplaceCenter; cent_y=cent_y+param.param_device.GridSpaceCenter)
+			{
+				compareRefMap_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod, cent_x, cent_y);
+			}
+		}	
+	}
+	else if (GPUAlgo == 0) //All shifts in one kernel
+	{
+		compareRefMapShifted_kernel <<<divup(RefMap.ntotRefMap, CUDA_THREAD_COUNT), CUDA_THREAD_COUNT, 0, cudaStream>>> (iProjectionOut, iConv, pConvMap_device[iConv & 1], pProb_device, param.param_device, pRefMap_device_Mod);
+	}
+	else
+	{
+		cout << "Invalid GPU Algorithm selected\n";
+		exit(1);
+	}
+	if (GPUAsync == 0) checkCudaErrors(cudaStreamSynchronize(cudaStream));
+	return(0);
+}
+
+int bioem_cuda::deviceInit()
+{
+	deviceExit();
+	
+	checkCudaErrors(cudaStreamCreate(&cudaStream));
+	checkCudaErrors(cudaMalloc(&pRefMap_device, sizeof(bioem_RefMap)));
+	checkCudaErrors(cudaMalloc(&pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap));
+	for (int i = 0;i < 2;i++)
+	{
+		checkCudaErrors(cudaEventCreate(&cudaEvent[i]));
+		checkCudaErrors(cudaMalloc(&pConvMap_device[i], sizeof(bioem_map)));
+	}
+	pRefMap_device_Mod = (bioem_RefMap_Mod*) pRefMap_device;
+	
+	if (GPUAlgo == 0 || GPUAlgo == 1)
+	{
+		bioem_RefMap_Mod* RefMapGPU = new bioem_RefMap_Mod(RefMap);
+		cudaMemcpy(pRefMap_device_Mod, RefMapGPU, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
+		delete RefMapGPU;
+	}
+	else
+	{
+		cudaMemcpy(pRefMap_device, &RefMap, sizeof(bioem_RefMap), cudaMemcpyHostToDevice);
+	}
+	deviceInitialized = 1;
+	return(0);
+}
+
+int bioem_cuda::deviceExit()
+{
+	if (deviceInitialized == 0) return(0);
+	
+	cudaStreamDestroy(cudaStream);
+	cudaFree(pRefMap_device);
+	cudaFree(pProb_device);
+	for (int i = 0;i < 2;i++)
+	{
+		cudaEventDestroy(cudaEvent[i]);
+		cudaFree(pConvMap_device);
+	}
+	cudaThreadExit();
+	
+	deviceInitialized = 0;
+	return(0);
+}
+
+int bioem_cuda::deviceStartRun()
+{
+	
+	cudaMemcpy(pProb_device, pProb, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyHostToDevice);
+	return(0);
+}
+
+int bioem_cuda::deviceFinishRun()
+{
+	if (GPUAsync) cudaStreamSynchronize(cudaStream);
+	cudaMemcpy(pProb, pProb_device, sizeof(bioem_Probability) * RefMap.ntotRefMap, cudaMemcpyDeviceToHost);
+	
+	return(0);
+}
+
+bioem* bioem_cuda_create()
+{
+	return new bioem_cuda;
+}
diff --git a/include/bioem_cuda_internal.h b/include/bioem_cuda_internal.h
index fa5dbd0..47b4091 100644
--- a/include/bioem_cuda_internal.h
+++ b/include/bioem_cuda_internal.h
@@ -22,12 +22,14 @@ protected:
 	int deviceInitialized;
 	
 	cudaStream_t cudaStream;
+	cudaEvent_t cudaEvent[2];
 	bioem_RefMap_Mod* pRefMap_device_Mod;
 	bioem_RefMap* pRefMap_device;
 	bioem_Probability* pProb_device;
-	bioem_map* pConvMap_device;
+	bioem_map* pConvMap_device[2];
 	
 	int GPUAlgo;
+	int GPUAsync;
 };
 
 #endif
diff --git a/include/map.h b/include/map.h
index a51f235..4c8e5ad 100644
--- a/include/map.h
+++ b/include/map.h
@@ -35,6 +35,7 @@ public:
 	bool dumpMap, loadMap;
 	
 	__host__ __device__ inline myfloat_t get(int map, int x, int y) const {return(Ref[map].points[x][y]);}
+	__host__ __device__ inline const myfloat_t* getp(int map, int x, int y) const {return(&Ref[map].points[x][y]);}
 };
 
 
-- 
GitLab