Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MPIfR-BDG
psrdada_cpp
Commits
15f1eb52
Commit
15f1eb52
authored
May 14, 2020
by
Tobias Winchen
Browse files
Power spectrum with new IO classes
parent
e931dc38
Changes
6
Hide whitespace changes
Inline
Side-by-side
psrdada_cpp/effelsberg/edd/DadaBufferLayout.hpp
View file @
15f1eb52
...
...
@@ -24,12 +24,16 @@ class DadaBufferLayout
public:
// input key of the dadad buffer
// size of spead heaps in bytes
// number of side channels
DadaBufferLayout
(
key_t
input_key
,
size_t
heapSize
,
size_t
nSideChannels
);
key_t
getInputkey
()
const
;
size_t
getBufferSize
()
const
;
// get size of heaps in bytes
size_t
getHeapSize
()
const
;
size_t
getNSideChannels
()
const
;
...
...
psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh
View file @
15f1eb52
...
...
@@ -41,17 +41,30 @@ typedef float IntegratedPowerType;
struct
InputDataStream
/// Input data and intermediate processing data for one polarization
struct
PolarizationData
{
virtual
void
swap
()
=
0
;
virtual
void
resize
(
size_t
_nchans
,
size_t
_batch
,
const
DadaBufferLayout
&
dadaBufferLayout
)
=
0
;
virtual
void
getFromBlock
(
RawBytes
&
block
,
DadaBufferLayout
&
dadaBufferLayout
,
cudaStream_t
&
_h2d_stream
)
=
0
;
};
DadaBufferLayout
_dadaBufferLayout
;
size_t
_fft_length
;
size_t
_batch
;
// a buffer contains batch * fft_length samples
PolarizationData
(
size_t
fft_length
,
size_t
batch
,
const
DadaBufferLayout
&
dadaBufferLayout
)
:
_fft_length
(
fft_length
),
_batch
(
batch
),
_dadaBufferLayout
(
dadaBufferLayout
)
{
_raw_voltage
.
resize
(
_dadaBufferLayout
.
sizeOfData
()
/
sizeof
(
uint64_t
));
BOOST_LOG_TRIVIAL
(
debug
)
<<
" Input voltages size (in 64-bit words): "
<<
_raw_voltage
.
size
();
_baseLineG0
.
resize
(
1
);
_baseLineG0_update
.
resize
(
1
);
_baseLineG1
.
resize
(
1
);
_baseLineG1_update
.
resize
(
1
);
_channelised_voltage_G0
.
resize
((
_fft_length
/
2
+
1
)
*
_batch
);
_channelised_voltage_G1
.
resize
((
_fft_length
/
2
+
1
)
*
_batch
);
_sideChannelData
.
resize
(
_dadaBufferLayout
.
getNSideChannels
()
*
_dadaBufferLayout
.
getNHeaps
());
BOOST_LOG_TRIVIAL
(
debug
)
<<
" Channelised voltages size: "
<<
_channelised_voltage_G0
.
size
();
};
/// Input data and intermediate processing data for one polarization
struct
PolarizationData
:
public
InputDataStream
{
/// Raw ADC Voltage
DoubleDeviceBuffer
<
RawVoltageType
>
_raw_voltage
;
/// Side channel data
...
...
@@ -79,80 +92,133 @@ struct PolarizationData : public InputDataStream
_sideChannelData
.
swap
();
}
void
resize
(
size_t
_nchans
,
size_t
_batch
,
const
DadaBufferLayout
&
dadaBufferLayout
)
{
_raw_voltage
.
resize
(
dadaBufferLayout
.
sizeOfData
()
/
sizeof
(
uint64_t
));
BOOST_LOG_TRIVIAL
(
debug
)
<<
" Input voltages size (in 64-bit words): "
<<
_raw_voltage
.
size
();
_baseLineG0
.
resize
(
1
);
_baseLineG0_update
.
resize
(
1
);
_baseLineG1
.
resize
(
1
);
_baseLineG1_update
.
resize
(
1
);
_channelised_voltage_G0
.
resize
(
_nchans
*
_batch
);
_channelised_voltage_G1
.
resize
(
_nchans
*
_batch
);
BOOST_LOG_TRIVIAL
(
debug
)
<<
" Channelised voltages size: "
<<
_channelised_voltage_G0
.
size
();
}
void
getFromBlock
(
RawBytes
&
block
,
DadaBufferLayout
&
dadaBufferLayout
,
cudaStream_t
&
_h2d_stream
)
void
getFromBlock
(
RawBytes
&
block
,
cudaStream_t
&
_h2d_stream
)
{
BOOST_LOG_TRIVIAL
(
debug
)
<<
" block.used_bytes() = "
<<
block
.
used_bytes
()
<<
", dataBlockBytes = "
<<
dadaBufferLayout
.
sizeOfData
()
<<
"
\n
"
;
<<
", dataBlockBytes = "
<<
_
dadaBufferLayout
.
sizeOfData
()
<<
"
\n
"
;
CUDA_ERROR_CHECK
(
cudaMemcpyAsync
(
static_cast
<
void
*>
(
_raw_voltage
.
a_ptr
()),
static_cast
<
void
*>
(
block
.
ptr
()),
dadaBufferLayout
.
sizeOfData
()
,
cudaMemcpyHostToDevice
,
_
dadaBufferLayout
.
sizeOfData
()
,
cudaMemcpyHostToDevice
,
_h2d_stream
));
CUDA_ERROR_CHECK
(
cudaMemcpyAsync
(
static_cast
<
void
*>
(
_sideChannelData
.
a_ptr
()),
static_cast
<
void
*>
(
block
.
ptr
()
+
dadaBufferLayout
.
sizeOfData
()
+
dadaBufferLayout
.
sizeOfGap
()),
dadaBufferLayout
.
sizeOfSideChannelData
(),
cudaMemcpyHostToDevice
,
_h2d_stream
));
static_cast
<
void
*>
(
block
.
ptr
()
+
_
dadaBufferLayout
.
sizeOfData
()
+
_
dadaBufferLayout
.
sizeOfGap
()),
_
dadaBufferLayout
.
sizeOfSideChannelData
(),
cudaMemcpyHostToDevice
,
_h2d_stream
));
BOOST_LOG_TRIVIAL
(
debug
)
<<
"First side channel item: 0x"
<<
std
::
setw
(
16
)
<<
std
::
setfill
(
'0'
)
<<
std
::
hex
<<
(
reinterpret_cast
<
uint64_t
*>
(
block
.
ptr
()
+
dadaBufferLayout
.
sizeOfData
()
+
dadaBufferLayout
.
sizeOfGap
()))[
0
]
<<
(
reinterpret_cast
<
uint64_t
*>
(
block
.
ptr
()
+
_
dadaBufferLayout
.
sizeOfData
()
+
_
dadaBufferLayout
.
sizeOfGap
()))[
0
]
<<
std
::
dec
;
}
};
};
/// Input data and intermediate processing data for two polarizations
struct
DualPolarizationData
{
DadaBufferLayout
_dadaBufferLayout
;
DualPolarizationData
(
size_t
fft_length
,
size_t
batch
,
const
DadaBufferLayout
&
dadaBufferLayout
)
:
polarization0
(
fft_length
,
batch
,
dadaBufferLayout
),
polarization1
(
fft_length
,
batch
,
dadaBufferLayout
),
_dadaBufferLayout
(
dadaBufferLayout
)
{
};
//struct DualPolarizationData : public InputDataStream
//{
// PolarizationData pol0, pol1;
// void swap()
// {
// pol0.swap(); pol1.swap();
// }
//
// void resize(size_t _nchans, size_t _batch, const DadaBufferLayout &dadaBufferLayout)
// {
// BOOST_LOG_TRIVIAL(debug) << " Pol0";
// pol0.resize(_nchans, _batch, dadaBufferLayout);
// BOOST_LOG_TRIVIAL(debug) << " Pol1";
// pol1.resize(_nchans, _batch, dadaBufferLayout);
// }
//};
PolarizationData
polarization0
,
polarization1
;
void
swap
()
{
polarization0
.
swap
();
polarization1
.
swap
();
}
void
getFromBlock
(
RawBytes
&
block
,
cudaStream_t
&
_h2d_stream
)
{
// Copy the data with stride to the GPU:
// CPU: P1P2P1P2P1P2 ...
// GPU: P1P1P1 ... P2P2P2 ...
// If this is a bottleneck the gating kernel could sort the layout out
// during copy
int
heapsize_bytes
=
_dadaBufferLayout
.
getHeapSize
();
CUDA_ERROR_CHECK
(
cudaMemcpy2DAsync
(
static_cast
<
void
*>
(
polarization0
.
_raw_voltage
.
a_ptr
()),
heapsize_bytes
,
static_cast
<
void
*>
(
block
.
ptr
()),
2
*
heapsize_bytes
,
heapsize_bytes
,
_dadaBufferLayout
.
sizeOfData
()
/
heapsize_bytes
/
2
,
cudaMemcpyHostToDevice
,
_h2d_stream
));
CUDA_ERROR_CHECK
(
cudaMemcpy2DAsync
(
static_cast
<
void
*>
(
polarization1
.
_raw_voltage
.
a_ptr
()),
heapsize_bytes
,
static_cast
<
void
*>
(
block
.
ptr
())
+
heapsize_bytes
,
2
*
heapsize_bytes
,
heapsize_bytes
,
_dadaBufferLayout
.
sizeOfData
()
/
heapsize_bytes
/
2
,
cudaMemcpyHostToDevice
,
_h2d_stream
));
CUDA_ERROR_CHECK
(
cudaMemcpy2DAsync
(
static_cast
<
void
*>
(
polarization0
.
_sideChannelData
.
a_ptr
()),
sizeof
(
uint64_t
),
static_cast
<
void
*>
(
block
.
ptr
()
+
_dadaBufferLayout
.
sizeOfData
()
+
_dadaBufferLayout
.
sizeOfGap
()),
2
*
sizeof
(
uint64_t
),
sizeof
(
uint64_t
),
_dadaBufferLayout
.
sizeOfSideChannelData
()
/
2
/
sizeof
(
uint64_t
),
cudaMemcpyHostToDevice
,
_h2d_stream
));
CUDA_ERROR_CHECK
(
cudaMemcpy2DAsync
(
static_cast
<
void
*>
(
polarization1
.
_sideChannelData
.
a_ptr
()),
sizeof
(
uint64_t
),
static_cast
<
void
*>
(
block
.
ptr
()
+
_dadaBufferLayout
.
sizeOfData
()
+
_dadaBufferLayout
.
sizeOfGap
()
+
sizeof
(
uint64_t
)),
2
*
sizeof
(
uint64_t
),
sizeof
(
uint64_t
),
_dadaBufferLayout
.
sizeOfSideChannelData
()
/
2
/
sizeof
(
uint64_t
),
cudaMemcpyHostToDevice
,
_h2d_stream
));
BOOST_LOG_TRIVIAL
(
debug
)
<<
"First side channel item: 0x"
<<
std
::
setw
(
16
)
<<
std
::
setfill
(
'0'
)
<<
std
::
hex
<<
(
reinterpret_cast
<
uint64_t
*>
(
block
.
ptr
()
+
_dadaBufferLayout
.
sizeOfData
()
+
_dadaBufferLayout
.
sizeOfGap
()))[
0
]
<<
std
::
dec
;
}
};
//
Output data for one gate N = 1 for one pol, or 4 for full stokes
//
/ INterface for the processed output data stream
struct
OutputDataStream
{
/// Reset outptu for new integration
size_t
_nchans
;
size_t
_blocks
;
OutputDataStream
(
size_t
nchans
,
size_t
blocks
)
:
_nchans
(
nchans
),
_blocks
(
blocks
)
{
}
/// Reset output to for new integration
virtual
void
reset
(
cudaStream_t
&
_proc_stream
)
=
0
;
/// Swap output buffers
virtual
void
swap
()
=
0
;
virtual
void
resize
(
size_t
size
,
size_t
blocks
)
=
0
;
// output buffer on the host
DoublePinnedHostBuffer
<
char
>
_host_power
;
// copy data from internal buffers of the implementation to the host output
// buffer
virtual
void
data2Host
(
cudaStream_t
&
_d2h_stream
)
=
0
;
};
// Output data for one gate, single power spectrum
struct
PowerSpectrumOutput
{
PowerSpectrumOutput
(
size_t
size
,
size_t
blocks
)
{
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Setting size of power spectrum output size = "
<<
size
<<
", blocks = "
<<
blocks
;
data
.
resize
(
size
*
blocks
);
_noOfBitSets
.
resize
(
blocks
);
}
/// spectrum data
DoubleDeviceBuffer
<
IntegratedPowerType
>
data
;
...
...
@@ -162,7 +228,7 @@ struct PowerSpectrumOutput
/// Reset outptu for new integration
void
reset
(
cudaStream_t
&
_proc_stream
)
{
thrust
::
fill
(
thrust
::
cuda
::
par
.
on
(
_proc_stream
),
data
.
a
().
begin
(),
data
.
a
().
end
(),
0.
);
thrust
::
fill
(
thrust
::
cuda
::
par
.
on
(
_proc_stream
),
data
.
a
().
begin
(),
data
.
a
().
end
(),
0.
);
thrust
::
fill
(
thrust
::
cuda
::
par
.
on
(
_proc_stream
),
_noOfBitSets
.
a
().
begin
(),
_noOfBitSets
.
a
().
end
(),
0L
);
}
...
...
@@ -172,20 +238,21 @@ struct PowerSpectrumOutput
data
.
swap
();
_noOfBitSets
.
swap
();
}
/// Resize all buffers
void
resize
(
size_t
size
,
size_t
blocks
)
{
data
.
resize
(
size
*
blocks
);
_noOfBitSets
.
resize
(
blocks
);
}
};
struct
GatedPowerSpectrumOutput
:
public
OutputDataStream
{
GatedPowerSpectrumOutput
(
size_t
nchans
,
size_t
blocks
)
:
OutputDataStream
(
nchans
,
blocks
),
G0
(
nchans
,
blocks
),
G1
(
nchans
,
blocks
)
{
// on the host both power are stored in the same data buffer together with
// the number of bit sets
_host_power
.
resize
(
2
*
(
_nchans
*
sizeof
(
IntegratedPowerType
)
+
sizeof
(
size_t
)
)
*
G0
.
_noOfBitSets
.
size
());
}
PowerSpectrumOutput
G0
,
G1
;
size_t
_nchans
;
void
reset
(
cudaStream_t
&
_proc_stream
)
{
...
...
@@ -201,19 +268,6 @@ struct GatedPowerSpectrumOutput : public OutputDataStream
_host_power
.
swap
();
}
/// Resize all buffers
void
resize
(
size_t
size
,
size_t
blocks
)
{
// ToDo: size passed in constructor, also number of blocks.
G0
.
resize
(
size
,
blocks
);
G1
.
resize
(
size
,
blocks
);
_nchans
=
size
;
// on the host both power are stored in the same data buffer together with
// the number of bit sets
_host_power
.
resize
(
2
*
(
size
*
sizeof
(
IntegratedPowerType
)
+
sizeof
(
size_t
)
)
*
G0
.
_noOfBitSets
.
size
());
}
void
data2Host
(
cudaStream_t
&
_d2h_stream
)
{
// copy data to host if block is finished
...
...
@@ -301,9 +355,11 @@ struct FullStokesOutput : public OutputDataStream
}
};
/*
struct GatedFullStokesOutput: public OutputDataStream
{
FullStokesOutput G0, G1;
void reset(cudaStream_t &_proc_stream)
{
...
...
@@ -324,8 +380,112 @@ struct GatedFullStokesOutput: public OutputDataStream
G0.resize(size, blocks);
G1.resize(size, blocks);
}
};
void data2Host(cudaStream_t &_d2h_stream)
{
for (size_t i = 0; i < G0._noOfBitSets.size(); i++)
{
size_t memslicesize = (_nchans * sizeof(IntegratedPowerType));
size_t memOffset = 8 * i * (memslicesize + + sizeof(size_t));
// Copy II QQ UU VV
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset) ,
static_cast<void *>(G0.I.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 1 * memslicesize) ,
static_cast<void *>(G1.I.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 2 * memslicesize) ,
static_cast<void *>(G0.Q.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 3 * memslicesize) ,
static_cast<void *>(G1.Q.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 4 * memslicesize) ,
static_cast<void *>(G0.U.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 5 * memslicesize) ,
static_cast<void *>(G1.U.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 6 * memslicesize) ,
static_cast<void *>(G0.V.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync(static_cast<void *>(_host_power_db.a_ptr() + memOffset + 7 * memslicesize) ,
static_cast<void *>(G1.V.b_ptr() + i * memslicesize),
_nchans * sizeof(IntegratedPowerType),
cudaMemcpyDeviceToHost, _d2h_stream));
// Copy SCI
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize),
static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 1 * sizeof(size_t)),
static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 2 * sizeof(size_t)),
static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 3 * sizeof(size_t)),
static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 4 * sizeof(size_t)),
static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 5 * sizeof(size_t)),
static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 6 * sizeof(size_t)),
static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
CUDA_ERROR_CHECK(
cudaMemcpyAsync( static_cast<void *>(_host_power_db.a_ptr() + memOffset + 8 * memslicesize + 7 * sizeof(size_t)),
static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
1 * sizeof(size_t),
cudaMemcpyDeviceToHost, _d2h_stream));
}
}
};
*/
...
...
@@ -335,7 +495,10 @@ struct GatedFullStokesOutput: public OutputDataStream
bit set in side channel data.
*/
template
<
class
HandlerType
>
class
GatedSpectrometer
{
template
<
class
HandlerType
,
class
InputType
,
class
OutputType
>
class
GatedSpectrometer
{
public:
/**
* @brief Constructor
...
...
@@ -384,6 +547,15 @@ public:
*/
bool
operator
()(
RawBytes
&
block
);
// Processing strategy for single pol mode
void
process
(
PolarizationData
*
inputDataStream
,
GatedPowerSpectrumOutput
*
outputDataStream
);
// Processing strategy for dual pol power mode
//void process(DualPolarizationData*inputDataStream, GatedPowerSpectrumOutput *outputDataStream);
// Processing strategy for full stokes mode
//void process(DualPolarizationData*inputDataStream, FullStokesOutput *outputDataStream);
private:
// gate the data from the input stream and fft data per gate. Number of bit
// sets in every gate is stored in the output datasets
...
...
@@ -408,22 +580,17 @@ private:
uint64_t
_nchans
;
uint64_t
_nBlocks
;
uint64_t
_call_count
;
double
_processing_efficiency
;
std
::
unique_ptr
<
Unpacker
>
_unpacker
;
OutputDataStream
*
outputDataStream
;
InputDataStream
*
inputDataStream
;
std
::
unique_ptr
<
DetectorAccumulator
<
IntegratedPowerType
>
>
_detector
;
OutputType
*
outputDataStream
;
InputType
*
inputDataStream
;
// Temporary processing block
// ToDo: Use inplace FFT to avoid temporary
c
oltage array
// ToDo: Use inplace FFT to avoid temporary
v
oltage array
thrust
::
device_vector
<
UnpackedVoltageType
>
_unpacked_voltage_G0
;
thrust
::
device_vector
<
UnpackedVoltageType
>
_unpacked_voltage_G1
;
cudaStream_t
_h2d_stream
;
cudaStream_t
_proc_stream
;
cudaStream_t
_d2h_stream
;
...
...
psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
View file @
15f1eb52
...
...
@@ -17,7 +17,7 @@ namespace psrdada_cpp {
namespace
effelsberg
{
namespace
edd
{
// Reduce thread local vatiable v in shared array x, so that x[0]
// Reduce thread local vatiable v in shared array x, so that x[0]
contains sum
template
<
typename
T
>
__device__
void
sum_reduce
(
T
*
x
,
const
T
&
v
)
{
...
...
@@ -32,9 +32,10 @@ __device__ void sum_reduce(T *x, const T &v)
}
// If one of the side channel items is l
s
ot, then both are considered as lost
// If one of the side channel items is lo
s
t, then both are considered as lost
// here
__global__
void
mergeSideChannels
(
uint64_t
*
__restrict__
A
,
uint64_t
*
__restrict__
B
,
size_t
N
)
__global__
void
mergeSideChannels
(
uint64_t
*
__restrict__
A
,
uint64_t
*
__restrict__
B
,
size_t
N
)
{
for
(
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
(
i
<
N
);
i
+=
blockDim
.
x
*
gridDim
.
x
)
...
...
@@ -146,21 +147,16 @@ __global__ void update_baselines(float* __restrict__ baseLineG0,
template
<
class
HandlerType
>
GatedSpectrometer
<
HandlerType
>::
GatedSpectrometer
(
const
DadaBufferLayout
&
dadaBufferLayout
,
std
::
size_t
selectedSideChannel
,
std
::
size_t
selectedBit
,
std
::
size_t
fft_length
,
std
::
size_t
naccumulate
,
std
::
size_t
nbits
,
float
input_level
,
float
output_level
,
HandlerType
&
handler
)
:
_dadaBufferLayout
(
dadaBufferLayout
),
_selectedSideChannel
(
selectedSideChannel
),
_selectedBit
(
selectedBit
),
_fft_length
(
fft_length
),
_naccumulate
(
naccumulate
),
_nbits
(
nbits
),
_handler
(
handler
),
_fft_plan
(
0
),
_call_count
(
0
),
_nsamps_per_heap
(
4096
),
_processing_efficiency
(
0.
){
template
<
class
HandlerType
,
class
InputType
,
class
OutputType
>
GatedSpectrometer
<
HandlerType
,
InputType
,
OutputType
>::
GatedSpectrometer
(
const
DadaBufferLayout
&
dadaBufferLayout
,
std
::
size_t
selectedSideChannel
,
std
::
size_t
selectedBit
,
std
::
size_t
fft_length
,
std
::
size_t
naccumulate
,
std
::
size_t
nbits
,
float
input_level
,
float
output_level
,
HandlerType
&
handler
)
:
_dadaBufferLayout
(
dadaBufferLayout
),
_selectedSideChannel
(
selectedSideChannel
),
_selectedBit
(
selectedBit
),
_fft_length
(
fft_length
),
_naccumulate
(
naccumulate
),
_nbits
(
nbits
),
_handler
(
handler
),
_fft_plan
(
0
),
_call_count
(
0
),
_nsamps_per_heap
(
4096
)
{
// Sanity checks
assert
(((
_nbits
==
12
)
||
(
_nbits
==
8
)));
...
...
@@ -184,7 +180,6 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
assert
(
selectedBit
<
64
);
// Sanity check of selected bit
_nsamps_per_buffer
=
_dadaBufferLayout
.
sizeOfData
()
*
8
/
nbits
;
_nsamps_per_output_spectra
=
fft_length
*
naccumulate
;
if
(
_nsamps_per_output_spectra
<=
_nsamps_per_buffer
)
{
// one buffer block is used for one or multiple output spectra
...
...
@@ -200,10 +195,12 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
assert
(
N
*
_nsamps_per_buffer
==
_nsamps_per_output_spectra
);
_nBlocks
=
N
;
}
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Integrating "
<<
_nsamps_per_output_spectra
<<
" samples from "
<<
_nBlocks
<<
" into one spectra."
;
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Integrating "
<<
_nsamps_per_output_spectra
<<
" samples from "
<<
_nBlocks
<<
" into one output spectrum."
;
_nchans
=
_fft_length
/
2
+
1
;
int
batch
=
_nsamps_per_buffer
/
_fft_length
;
float
dof
=
2
*
_naccumulate
;
float
scale
=
std
::
pow
(
input_level
*
std
::
sqrt
(
static_cast
<
float
>
(
_nchans
)),
2
);
...
...
@@ -213,27 +210,28 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
<<
"Correction factors for 8-bit conversion: offset = "
<<
offset
<<
", scaling = "
<<
scaling
;
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Generating FFT plan"
;
int
n
[]
=
{
static_cast
<
int
>
(
_fft_length
)};
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Generating FFT plan:
\n
"
<<
" fft_length = "
<<
_fft_length
<<
"
\n
"
<<
" n[0] = "
<<
n
[
0
]
<<
"
\n
"
<<
" _nchans = "
<<
_nchans
<<
"
\n
"
<<
" batch = "
<<
batch
<<
"
\n
"
;
;
CUFFT_ERROR_CHECK
(
cufftPlanMany
(
&
_fft_plan
,
1
,
n
,
NULL
,
1
,
_fft_length
,
NULL
,
1
,
_nchans
,
CUFFT_R2C
,
batch
));
cufftSetStream
(
_fft_plan
,
_proc_stream
);
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Allocating memory"
;
// if singlePol
inputDataStream
=
new
PolarizationData
();
inputDataStream
->
resize
(
_nchans
,
batch
,
_dadaBufferLayout
);
inputDataStream
=
new
InputType
(
fft_length
,
batch
,
_dadaBufferLayout
);
_unpacked_voltage_G0
.
resize
(
_nsamps_per_buffer
);
_unpacked_voltage_G1
.
resize
(
_nsamps_per_buffer
);
BOOST_LOG_TRIVIAL
(
debug
)
<<
" Unpacked voltages size (in samples): "
<<
_unpacked_voltage_G0
.
size
();
outputDataStream
=
new
GatedPowerSpectrumOutput
();
outputDataStream
->
resize
(
_nchans
,
batch
/
(
_naccumulate
/
_nBlocks
));
outputDataStream
=
new
OutputType
(
_nchans
,
batch
/
(
_naccumulate
/
_nBlocks
));
CUDA_ERROR_CHECK
(
cudaStreamCreate
(
&
_h2d_stream
));
CUDA_ERROR_CHECK
(
cudaStreamCreate
(
&
_proc_stream
));
...
...
@@ -241,14 +239,13 @@ GatedSpectrometer<HandlerType>::GatedSpectrometer(
CUFFT_ERROR_CHECK
(
cufftSetStream
(
_fft_plan
,
_proc_stream
));
_unpacker
.
reset
(
new
Unpacker
(
_proc_stream
));
_detector
.
reset
(
new
DetectorAccumulator
<
IntegratedPowerType
>
(
_nchans
,
_naccumulate
/
_nBlocks
,
scaling
,
offset
,
_proc_stream
));
}
// constructor
template
<
class
HandlerType
>
GatedSpectrometer
<
HandlerType
>::~
GatedSpectrometer
()
{
template
<
class
HandlerType
,
class
InputType
,
class
OutputType
>
GatedSpectrometer
<
HandlerType
,
InputType
,
OutputType
>::~
GatedSpectrometer
()
{
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Destroying GatedSpectrometer"
;
cudaDeviceSynchronize
();
if
(
!
_fft_plan
)
cufftDestroy
(
_fft_plan
);
cudaStreamDestroy
(
_h2d_stream
);
...
...
@@ -257,14 +254,14 @@ GatedSpectrometer<HandlerType>::~GatedSpectrometer() {
}