Commit 89cc32a4 authored by David Rohr's avatar David Rohr

use host pinned memory for dma transfers

parent 7a018205
......@@ -168,7 +168,7 @@ int bioem::configure(int ac, char* av[])
param.nTotCTFs = atoi(getenv("BIOEM_DEBUG_BREAK"));
}
pProb.init(RefMap.ntotRefMap, param.nTotGridAngles);
pProb.init(RefMap.ntotRefMap, param.nTotGridAngles, *this);
deviceInit();
......@@ -178,7 +178,7 @@ int bioem::configure(int ac, char* av[])
void bioem::cleanup()
{
//Deleting allocated pointers
free(pProb.ptr);
free_device_host(pProb.ptr);
RefMap.freePointers();
}
......@@ -556,3 +556,13 @@ int bioem::deviceFinishRun()
{
return(0);
}
void* bioem::malloc_device_host(size_t size)
{
return(mallocchk(size));
}
void bioem::free_device_host(void* ptr)
{
free(ptr);
}
......@@ -166,7 +166,8 @@ int bioem_cuda::compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map
if (FFTAlgo)
{
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0]));
memcpy(&pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], localmultFFT, param.FFTMapSize * sizeof(mycomplex_t));
checkCudaErrors(cudaMemcpyAsync(&pConvMapFFT[(iConv & 1) * param.FFTMapSize], &pConvMapFFT_Host[(iConv & 1) * param.FFTMapSize], param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice, cudaStream[0]));
if (GPUDualStream)
{
checkCudaErrors(cudaEventRecord(cudaFFTEvent[0], cudaStream[0]));
......@@ -309,6 +310,7 @@ int bioem_cuda::deviceInit()
pFFTtmp2[1] = pFFTtmp2[0] + CUDA_FFTS_AT_ONCE * param.FFTMapSize;
pFFTtmp[1] = pFFTtmp[0] + CUDA_FFTS_AT_ONCE * param.param_device.NumberPixels * param.param_device.NumberPixels;
checkCudaErrors(cudaMalloc(&pConvMapFFT, param.FFTMapSize * sizeof(mycomplex_t) * 2));
checkCudaErrors(cudaHostAlloc(&pConvMapFFT_Host, param.FFTMapSize * sizeof(mycomplex_t) * 2, 0));
checkCudaErrors(cudaMemcpy(pRefMapsFFT, RefMap.RefMapsFFT, RefMap.ntotRefMap * param.FFTMapSize * sizeof(mycomplex_t), cudaMemcpyHostToDevice));
}
......@@ -335,6 +337,7 @@ int bioem_cuda::deviceExit()
{
cudaFree(pRefMapsFFT);
cudaFree(pConvMapFFT);
cudaFreeHost(pConvMapFFT_Host);
cudaFree(pFFTtmp[0]);
cudaFree(pFFTtmp2[0]);
}
......@@ -357,7 +360,7 @@ int bioem_cuda::deviceStartRun()
{
maxRef = GPUWorkload >= 100 ? RefMap.ntotRefMap : ((size_t) RefMap.ntotRefMap * (size_t) GPUWorkload / 100);
cudaMemcpy(pProb_device.ptr, pProb.ptr, pProb.get_size(RefMap.ntotRefMap, param.nTotGridAngles), cudaMemcpyHostToDevice);
cudaMemcpyAsync(pProb_device.ptr, pProb.ptr, pProb.get_size(RefMap.ntotRefMap, param.nTotGridAngles), cudaMemcpyHostToDevice, cudaStream[0]);
if (FFTAlgo)
{
......@@ -391,7 +394,7 @@ int bioem_cuda::deviceStartRun()
int bioem_cuda::deviceFinishRun()
{
if (GPUAsync) cudaStreamSynchronize(cudaStream[0]);
cudaMemcpy(pProb.ptr, pProb_device.ptr, pProb.get_size(RefMap.ntotRefMap, param.nTotGridAngles), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(pProb.ptr, pProb_device.ptr, pProb.get_size(RefMap.ntotRefMap, param.nTotGridAngles), cudaMemcpyDeviceToHost, cudaStream[0]);
if (FFTAlgo)
{
......@@ -405,6 +408,18 @@ int bioem_cuda::deviceFinishRun()
return(0);
}
void* bioem_cuda::malloc_device_host(size_t size)
{
void* ptr;
checkCudaErrors(cudaHostAlloc(&ptr, size, 0));
return(ptr);
}
void bioem_cuda::free_device_host(void* ptr)
{
cudaFreeHost(ptr);
}
bioem* bioem_cuda_create()
{
return new bioem_cuda;
......
......@@ -29,6 +29,9 @@ public:
virtual int compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
int createProjection(int iMap, mycomplex_t* map);
int calcross_cor(myfloat_t* localmap, myfloat_t& sum, myfloat_t& sumsquare);
void calculateCCFFT(int iMap, int iOrient, int iConv, myfloat_t sumC, myfloat_t sumsquareC, mycomplex_t* localConvFFT, mycomplex_t* localCCT, myfloat_t* lCC);
......
......@@ -18,6 +18,8 @@ public:
virtual ~bioem_cuda();
virtual int compareRefMaps(int iOrient, int iConv, const myfloat_t* conv_map, mycomplex_t* localmultFFT, myfloat_t sumC, myfloat_t sumsquareC, const int startMap = 0);
virtual void* malloc_device_host(size_t size);
virtual void free_device_host(void* ptr);
protected:
virtual int deviceInit();
......@@ -37,6 +39,7 @@ protected:
mycomplex_t* pRefMapsFFT;
mycomplex_t* pConvMapFFT;
mycomplex_t* pConvMapFFT_Host;
mycuComplex_t* pFFTtmp2[2];
myfloat_t* pFFTtmp[2];
cufftHandle plan[2][2];
......@@ -52,3 +55,4 @@ protected:
};
#endif
......@@ -103,13 +103,7 @@ public:
return(maps * (angles * sizeof(bioem_Probability_angle) + sizeof(bioem_Probability_map)));
}
void init(size_t maps, size_t angles)
{
nMaps = maps;
nAngles = angles;
ptr = mallocchk(get_size(maps, angles));
set_pointers();
}
void init(size_t maps, size_t angles, bioem& bio);
void set_pointers()
{
......
......@@ -207,3 +207,11 @@ int bioem_RefMap::precalculate(bioem_param& param, bioem& bio)
return(0);
}
void bioem_Probability::init(size_t maps, size_t angles, bioem& bio)
{
nMaps = maps;
nAngles = angles;
ptr = bio.malloc_device_host(get_size(maps, angles));
set_pointers();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment