Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MPIBP-Hummer
BioEM
Commits
81fab4b2
Commit
81fab4b2
authored
Apr 13, 2014
by
David Rohr
Browse files
split ffts for maps on gpu in smaller steps to save memory
parent
e914db36
Changes
3
Hide whitespace changes
Inline
Side-by-side
bioem_cuda.cu
View file @
81fab4b2
...
...
@@ -77,11 +77,11 @@ __global__ void compareRefMapLoopShifts_kernel(const int iOrient, const int iCon
compareRefMap
<
2
>
(
iRefMap
,
iOrient
,
iConv
,
*
pMap
,
pProb
,
param
,
*
RefMap
,
cent_x
,
cent_y
,
myShift
,
nShifts
*
nShifts
,
myRef
,
threadActive
);
}
__global__
void
multComplexMap
(
const
mycomplex_t
*
convmap
,
const
mycomplex_t
*
refmap
,
mycuComplex_t
*
out
,
const
int
NumberPixelsTotal
,
const
int
MapSize
,
const
int
NumberMaps
)
__global__
void
multComplexMap
(
const
mycomplex_t
*
convmap
,
const
mycomplex_t
*
refmap
,
mycuComplex_t
*
out
,
const
int
NumberPixelsTotal
,
const
int
MapSize
,
const
int
NumberMaps
,
const
int
Offset
)
{
if
(
myBlockIdxX
>=
NumberMaps
)
return
;
const
mycomplex_t
*
myin
=
&
refmap
[
myBlockIdxX
*
MapSize
];
mycuComplex_t
*
myout
=
&
out
[
myBlockIdxX
*
MapSize
];
const
mycomplex_t
*
myin
=
&
refmap
[
myBlockIdxX
*
MapSize
+
Offset
];
mycuComplex_t
*
myout
=
&
out
[
(
myBlockIdxX
*
MapSize
)
];
for
(
int
i
=
myThreadIdxX
;
i
<
NumberPixelsTotal
;
i
+=
myBlockDimX
)
{
myout
[
i
].
x
=
convmap
[
i
][
0
]
*
myin
[
i
][
0
]
+
convmap
[
i
][
1
]
*
myin
[
i
][
1
];
...
...
@@ -89,10 +89,10 @@ __global__ void multComplexMap(const mycomplex_t* convmap, const mycomplex_t* re
}
}
__global__
void
cuDoRefMapsFFT
(
const
int
iOrient
,
const
int
iConv
,
const
myfloat_t
*
lCC
,
const
myfloat_t
sumC
,
const
myfloat_t
sumsquareC
,
bioem_Probability
*
pProb
,
const
bioem_param_device
param
,
const
bioem_RefMap
*
RefMap
,
const
int
maxRef
)
__global__
void
cuDoRefMapsFFT
(
const
int
iOrient
,
const
int
iConv
,
const
myfloat_t
*
lCC
,
const
myfloat_t
sumC
,
const
myfloat_t
sumsquareC
,
bioem_Probability
*
pProb
,
const
bioem_param_device
param
,
const
bioem_RefMap
*
RefMap
,
const
int
maxRef
,
const
int
Offset
)
{
const
int
iRefMap
=
myBlockIdxX
*
myBlockDimX
+
myThreadIdxX
;
const
myfloat_t
*
mylCC
=
&
lCC
[
iRefMap
*
param
.
NumberPixels
*
param
.
NumberPixels
];
const
int
iRefMap
=
myBlockIdxX
*
myBlockDimX
+
myThreadIdxX
+
Offset
;
const
myfloat_t
*
mylCC
=
&
lCC
[
(
myBlockIdxX
*
myBlockDimX
+
myThreadIdxX
)
*
param
.
NumberPixels
*
param
.
NumberPixels
];
if
(
iRefMap
>=
maxRef
)
return
;
doRefMapFFT
(
iRefMap
,
iOrient
,
iConv
,
mylCC
,
sumC
,
sumsquareC
,
pProb
,
param
,
*
RefMap
);
}
...
...
@@ -126,14 +126,17 @@ int bioem_cuda::compareRefMaps(int iProjectionOut, int iConv, const bioem_map& c
if
(
FFTAlgo
)
{
checkCudaErrors
(
cudaMemcpyAsync
(
&
pConvMapFFT
[(
iConv
&
1
)
*
param
.
RefMapSize
],
localmultFFT
,
param
.
RefMapSize
*
sizeof
(
mycomplex_t
),
cudaMemcpyHostToDevice
,
cudaStream
));
multComplexMap
<<<
maxRef
,
CUDA_THREAD_COUNT
,
0
,
cudaStream
>>>
(
&
pConvMapFFT
[(
iConv
&
1
)
*
param
.
RefMapSize
],
pRefMapsFFT
,
pFFTtmp2
,
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberFFTPixels1D
,
param
.
RefMapSize
,
maxRef
);
cudaZeroMem
<<<
32
,
256
>>>
(
pFFTtmp
,
maxRef
*
sizeof
(
myfloat_t
)
*
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberPixels
);
if
(
mycufftExecC2R
(
plan
,
pFFTtmp2
,
pFFTtmp
)
!=
CUFFT_SUCCESS
)
for
(
int
i
=
0
;
i
<
maxRef
;
i
+=
CUDA_FFTS_AT_ONCE
)
{
cout
<<
"Error running CUFFT
\n
"
;
exit
(
1
);
const
int
num
=
min
(
CUDA_FFTS_AT_ONCE
,
maxRef
-
i
);
multComplexMap
<<<
num
,
CUDA_THREAD_COUNT
,
0
,
cudaStream
>>>
(
&
pConvMapFFT
[(
iConv
&
1
)
*
param
.
RefMapSize
],
pRefMapsFFT
,
pFFTtmp2
,
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberFFTPixels1D
,
param
.
RefMapSize
,
num
,
i
);
if
(
mycufftExecC2R
(
i
+
CUDA_FFTS_AT_ONCE
>
maxRef
?
plan
[
1
]
:
plan
[
0
],
pFFTtmp2
,
pFFTtmp
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error running CUFFT
\n
"
;
exit
(
1
);
}
cuDoRefMapsFFT
<<<
divup
(
num
,
CUDA_THREAD_COUNT
),
CUDA_THREAD_COUNT
,
0
,
cudaStream
>>>
(
iProjectionOut
,
iConv
,
pFFTtmp
,
sumC
,
sumsquareC
,
pProb_device
,
param
.
param_device
,
pRefMap_device
,
num
,
i
);
}
cuDoRefMapsFFT
<<<
divup
(
maxRef
,
CUDA_THREAD_COUNT
),
CUDA_THREAD_COUNT
,
0
,
cudaStream
>>>
(
iProjectionOut
,
iConv
,
pFFTtmp
,
sumC
,
sumsquareC
,
pProb_device
,
param
.
param_device
,
pRefMap_device
,
maxRef
);
checkCudaErrors
(
cudaGetLastError
());
}
else
...
...
@@ -208,7 +211,9 @@ int bioem_cuda::deviceInit()
if
(
FFTAlgo
)
GPUAlgo
=
2
;
checkCudaErrors
(
cudaStreamCreate
(
&
cudaStream
));
cout
<<
"
\t
Size RefMap
\t\t
"
<<
sizeof
(
bioem_RefMap
)
<<
"
\n
"
;
checkCudaErrors
(
cudaMalloc
(
&
pRefMap_device
,
sizeof
(
bioem_RefMap
)));
cout
<<
"
\t
Size Probability
\t
"
<<
sizeof
(
bioem_Probability
)
*
RefMap
.
ntotRefMap
<<
"
\n
"
;
checkCudaErrors
(
cudaMalloc
(
&
pProb_device
,
sizeof
(
bioem_Probability
)
*
RefMap
.
ntotRefMap
));
for
(
int
i
=
0
;
i
<
2
;
i
++
)
{
...
...
@@ -219,28 +224,14 @@ int bioem_cuda::deviceInit()
if
(
FFTAlgo
)
{
cout
<<
"
\t
Size RefMapFFT
\t\t
"
<<
RefMap
.
ntotRefMap
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)
<<
"
\n
"
;
checkCudaErrors
(
cudaMalloc
(
&
pRefMapsFFT
,
RefMap
.
ntotRefMap
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)));
checkCudaErrors
(
cudaMalloc
(
&
pFFTtmp2
,
RefMap
.
ntotRefMap
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)));
checkCudaErrors
(
cudaMalloc
(
&
pFFTtmp
,
RefMap
.
ntotRefMap
*
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberPixels
*
sizeof
(
myfloat_t
)));
cout
<<
"
\t
Size RefMapFFT Copy
\t
"
<<
CUDA_FFTS_AT_ONCE
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)
<<
"
\n
"
;
checkCudaErrors
(
cudaMalloc
(
&
pFFTtmp2
,
CUDA_FFTS_AT_ONCE
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)));
cout
<<
"
\t
Size RefMapFFT Read
\t
"
<<
CUDA_FFTS_AT_ONCE
*
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberPixels
*
sizeof
(
myfloat_t
)
<<
"
\n
"
;
checkCudaErrors
(
cudaMalloc
(
&
pFFTtmp
,
CUDA_FFTS_AT_ONCE
*
param
.
param_device
.
NumberPixels
*
param
.
param_device
.
NumberPixels
*
sizeof
(
myfloat_t
)));
checkCudaErrors
(
cudaMalloc
(
&
pConvMapFFT
,
param
.
RefMapSize
*
sizeof
(
mycomplex_t
)
*
2
));
cudaMemcpy
(
pRefMapsFFT
,
RefMap
.
RefMapsFFT
,
RefMap
.
ntotRefMap
*
param
.
RefMapSize
*
sizeof
(
mycomplex_t
),
cudaMemcpyHostToDevice
);
int
n
[
2
]
=
{
param
.
param_device
.
NumberPixels
,
param
.
param_device
.
NumberPixels
};
if
(
cufftPlanMany
(
&
plan
,
2
,
n
,
NULL
,
1
,
0
,
NULL
,
1
,
0
,
CUFFT_C2R
,
RefMap
.
ntotRefMap
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error planning CUFFT
\n
"
;
exit
(
1
);
}
if
(
cufftSetCompatibilityMode
(
plan
,
CUFFT_COMPATIBILITY_NATIVE
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error planning CUFFT compatibility
\n
"
;
exit
(
1
);
}
if
(
cufftSetStream
(
plan
,
cudaStream
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error setting CUFFT stream
\n
"
;
exit
(
1
);
}
}
if
(
GPUAlgo
==
0
||
GPUAlgo
==
1
)
...
...
@@ -274,9 +265,8 @@ int bioem_cuda::deviceExit()
{
cudaFree
(
pRefMapsFFT
);
cudaFree
(
pConvMapFFT
);
cudaFree
(
pFFTtmp
);
//
cudaFree(pFFTtmp);
cudaFree
(
pFFTtmp2
);
cufftDestroy
(
plan
);
}
cudaThreadExit
();
...
...
@@ -289,6 +279,29 @@ int bioem_cuda::deviceStartRun()
maxRef
=
GPUWorkload
>=
100
?
RefMap
.
ntotRefMap
:
((
size_t
)
RefMap
.
ntotRefMap
*
(
size_t
)
GPUWorkload
/
100
);
cudaMemcpy
(
pProb_device
,
pProb
,
sizeof
(
bioem_Probability
)
*
maxRef
,
cudaMemcpyHostToDevice
);
if
(
FFTAlgo
)
{
for
(
int
i
=
0
;
i
<
2
;
i
++
)
{
int
n
[
2
]
=
{
param
.
param_device
.
NumberPixels
,
param
.
param_device
.
NumberPixels
};
if
(
cufftPlanMany
(
&
plan
[
i
],
2
,
n
,
NULL
,
1
,
0
,
NULL
,
1
,
0
,
CUFFT_C2R
,
i
?
(
maxRef
%
CUDA_FFTS_AT_ONCE
)
:
CUDA_FFTS_AT_ONCE
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error planning CUFFT
\n
"
;
exit
(
1
);
}
if
(
cufftSetCompatibilityMode
(
plan
[
i
],
CUFFT_COMPATIBILITY_NATIVE
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error planning CUFFT compatibility
\n
"
;
exit
(
1
);
}
if
(
cufftSetStream
(
plan
[
i
],
cudaStream
)
!=
CUFFT_SUCCESS
)
{
cout
<<
"Error setting CUFFT stream
\n
"
;
exit
(
1
);
}
}
}
return
(
0
);
}
...
...
@@ -297,6 +310,11 @@ int bioem_cuda::deviceFinishRun()
if
(
GPUAsync
)
cudaStreamSynchronize
(
cudaStream
);
cudaMemcpy
(
pProb
,
pProb_device
,
sizeof
(
bioem_Probability
)
*
maxRef
,
cudaMemcpyDeviceToHost
);
if
(
FFTAlgo
)
{
for
(
int
i
=
0
;
i
<
2
;
i
++
)
cufftDestroy
(
plan
[
i
]);
}
return
(
0
);
}
...
...
include/bioem_cuda_internal.h
View file @
81fab4b2
...
...
@@ -38,7 +38,7 @@ protected:
mycomplex_t
*
pConvMapFFT
;
mycuComplex_t
*
pFFTtmp2
;
myfloat_t
*
pFFTtmp
;
cufftHandle
plan
;
cufftHandle
plan
[
2
]
;
int
GPUAlgo
;
//GPU Algorithm to use, 0: parallelize over maps, 1: as 0 but work split in multiple kernels (better), 2: also parallelize over shifts (best)
int
GPUAsync
;
//Run GPU Asynchronously, do the convolutions on the host in parallel.
...
...
include/defs.h
View file @
81fab4b2
...
...
@@ -40,7 +40,7 @@ typedef myfloat_t mycomplex_t[2];
#define BIOEM_MAP_SIZE_X 224
#define BIOEM_MAP_SIZE_Y 224
#define BIOEM_MODEL_SIZE 120000
#define BIOEM_MAX_MAPS
4
000
#define BIOEM_MAX_MAPS
8
000
#define MAX_ORIENT 20000
struct
myfloat3_t
...
...
@@ -70,5 +70,6 @@ struct myfloat3_t
#define CUDA_THREAD_COUNT 256
#define CUDA_BLOCK_COUNT 1024 * 16
#define CUDA_MAX_SHIFT_REDUCE 1024
#define CUDA_FFTS_AT_ONCE 1024
#endif
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment