Commit c8b3d3b6 authored by Niclas Esser

Merged cryopaf into devel

parent 6a287b3c
Pipeline #100153 passed
Showing 3294 additions and 0 deletions
@@ -88,3 +88,4 @@ install(DIRECTORY detail DESTINATION include/psrdada_cpp)
add_subdirectory(meerkat)
add_subdirectory(effelsberg)
add_subdirectory(cryopaf)
/*
* Beamformer.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file consists of a single class (Beamformer<ComputeType>). An object of Beamformer
* can be used to perform either a Stokes I detection or raw voltage beamforming
* on a GPU.
* Both beamforming kernels expect the same data product (linearly aligned in device memory):
* Input: F-P-T-E
* Weight: F-P-B-E
* Output: F-T-B-P (voltage beams)
* Output: F-T-B (Stokes I beams)
*/
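/*
* Illustration (editorial addition, derived from the format strings above):
* with F channels, P polarisations, T samples, E elements and B beams, the
* linear index of an input sample (f, p, t, e) in the F-P-T-E layout is
*     idx_in  = ((f * P + p) * T + t) * E + e;
* a weight (f, p, b, e) in the F-P-B-E layout is
*     idx_w   = ((f * P + p) * B + b) * E + e;
* and a Stokes I output sample (f, t, b) in the F-T-B layout is
*     idx_out = (f * T + t) * B + b;
*/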
#ifndef BEAMFORMER_CUH_
#define BEAMFORMER_CUH_
#include <cuda.h>
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
namespace psrdada_cpp{
namespace cryopaf{
// Constants for beamform kernels
#define NTHREAD 1024
#define TILE_SIZE 32
#define WARP_SIZE 32
/**
* @brief GPU kernel to perform Stokes I detection beamforming
*
* @detail Template type T has to be either T=float2 or T=__half2.
*    According to T, U has to be either U=float or U=__half
*
* @param T* idata pointer to input memory (format: F-P-T-E)
* @param T* wdata pointer to beam weight memory (format: F-P-B-E)
* @param U* odata pointer to output memory; U is equal to decltype(T::x) (format: F-T-B)
* @param int time Width of time dimension (T)
* @param int elem Number of elements (E)
* @param int beam Number of beams (B)
* @param int integrate Integration time; must be a power of 2 and no greater than 32
*
* @TODO: Allow greater integration time
*/
template<typename T, typename U>__global__
void beamformer_power_fpte_fpbe_ftb(
const T *idata,
const T* wdata,
U *odata,
int time,
int elem,
int beam,
int integrate);
/**
* @brief GPU kernel to perform raw voltage beamforming
*
* @detail Template type T has to be either T=float2 or T=__half2
*
* @param T* idata pointer to input memory (format: F-P-T-E)
* @param T* wdata pointer to beam weight memory (format: F-P-B-E)
* @param T* odata pointer to output memory (format: F-P-T-B)
* @param int time Width of time dimension (T)
* @param int elem Number of elements (E)
* @param int beam Number of beams (B)
*/
template<typename T>__global__
void beamformer_voltage_fpte_fpbe_fptb(
const T *idata,
const T* wdata,
T *odata,
int time,
int elem,
int beam);
template<class ComputeType>
class Beamformer{
// Internal type definitions
private:
typedef decltype(ComputeType::x) ResultType; // Only needed for the Stokes I beamformer
// Public functions
public:
/**
* @brief constructs an object of Beamformer<ComputeType> (ComputeType=float2 or ComputeType=__half2)
*
* @param cudaStream_t& stream Object of cudaStream_t to allow parallel copy + processing (has to be created and destroyed elsewhere)
* @param std::size_t sample Number of samples to process in one kernel launch (no restrictions)
* @param std::size_t channel Number of channels to process in one kernel launch (no restrictions)
* @param std::size_t element Number of elements to process in one kernel launch (no restrictions)
* @param std::size_t beam Number of beams to process in one kernel launch (no restrictions)
* @param std::size_t integration Samples to be integrated; must be a power of 2 and no greater than 32
*/
Beamformer(
cudaStream_t& stream,
std::size_t sample,
std::size_t channel,
std::size_t element,
std::size_t beam,
std::size_t integration = 1);
/**
* @brief Destroys an object of Beamformer<ComputeType> (ComputeType=float2 or ComputeType=__half2)
*/
~Beamformer();
/**
* @brief Launches voltage beamforming GPU kernel
*
* @param ComputeType* input pointer to input memory (format: F-P-T-E)
* @param ComputeType* weights pointer to beam weight memory (format: F-P-B-E)
* @param ComputeType* output pointer to output memory (format: F-T-B-P)
*/
void process(
const ComputeType* input,
const ComputeType* weights,
ComputeType* output);
/**
* @brief Launches Stokes I beamforming GPU kernel
*
* @param ComputeType* input pointer to input memory (format: F-P-T-E)
* @param ComputeType* weights pointer to beam weight memory (format: F-P-B-E)
* @param ResultType* output pointer to output memory (format: F-T-B)
*/
void process(
const ComputeType* input,
const ComputeType* weights,
ResultType* output);
/**
* @brief Prints the block and grid layout of a kernel (used for debugging purposes)
*/
void print_layout();
// Private attributes
private:
cudaStream_t& _stream;
dim3 grid;
dim3 block;
std::size_t _sample;
std::size_t _channel;
std::size_t _element;
std::size_t _beam;
std::size_t _integration;
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/details/utils.cu"
#include "psrdada_cpp/cryopaf/details/BeamformerKernels.cu"
#include "psrdada_cpp/cryopaf/src/Beamformer.cu"
#endif /* BEAMFORMER_CUH_ */
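A minimal usage sketch of the Beamformer class (editorial addition; all names and sizes are hypothetical, buffer extents follow the F-P-T-E / F-P-B-E / F-T-B formats documented above):

// Sketch: Stokes I beamforming with single precision input
#include <thrust/device_vector.h>
#include "psrdada_cpp/cryopaf/Beamformer.cuh"

using namespace psrdada_cpp::cryopaf;

void beamform_example()
{
    std::size_t sample = 1024, channel = 14, element = 128, beam = 64, integration = 4;
    cudaStream_t stream;
    CUDA_ERROR_CHECK(cudaStreamCreate(&stream));
    Beamformer<float2> bf(stream, sample, channel, element, beam, integration);
    // n_pol = 2 is fixed throughout the pipeline
    thrust::device_vector<float2> input(channel * 2 * sample * element);
    thrust::device_vector<float2> weights(channel * 2 * beam * element);
    thrust::device_vector<float>  output(channel * sample * beam / integration);
    // Overload resolution picks the Stokes I kernel for a ResultType* output
    bf.process(thrust::raw_pointer_cast(input.data()),
               thrust::raw_pointer_cast(weights.data()),
               thrust::raw_pointer_cast(output.data()));
    // process() enqueues work on the stream, so synchronize before reading
    CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
    CUDA_ERROR_CHECK(cudaStreamDestroy(stream));
}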
/*
* BufferTypes.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file contains classes for different kinds of buffer representation.
* All implemented classes inherit from DoubleBuffer<thrust::device_vector<T>>.
*/
#ifndef BUFFERTYPES_HPP_
#define BUFFERTYPES_HPP_
// boost::interprocess used to upload weights via POSIX shared memory
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <boost/thread.hpp>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/double_device_buffer.cuh"
#include "psrdada_cpp/double_host_buffer.cuh"
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"
namespace psrdada_cpp {
namespace cryopaf{
/**
* @brief Class providing buffers for raw voltage data
*/
template<class T>
class RawVoltage : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
public:
/**
* @brief Instantiates an object of RawVoltage
*
* @param std::size_t Number of items in buffer
*
* @detail Allocates twice the given size in device memory, since it is a double device buffer
*/
RawVoltage(std::size_t size)
: DoubleBuffer<thrust::device_vector<T>>()
{
this->resize(size);
_bytes = size * sizeof(T);
}
/**
* @brief Destroys an object of RawVoltage
*/
~RawVoltage(){}
/**
* @brief Returns the number of bytes used for a single buffer
*
* @detail The actually occupied device memory is twice this value due to double buffering
*/
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
};
/**
* @brief Class providing buffers for beam data (always the output of the pipeline)
* @detail An object of BeamOutput also contains an instance of DoublePinnedHostBuffer<T>
* to allow an asynchronous copy to the host memory.
*/
template<class T>
class BeamOutput : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
DoublePinnedHostBuffer<T> host;
public:
/**
* @brief Instantiates an object of BeamOutput
*
* @param std::size_t Number of items in buffer
*
* @detail Allocates twice the given size in device memory and in host memory (double buffers)
*/
BeamOutput(std::size_t size)
: DoubleBuffer<thrust::device_vector<T>>()
{
this->resize(size);
host.resize(size);
_bytes = size * sizeof(T);
}
/**
* @brief Destroys an object of BeamOutput
*/
~BeamOutput(){}
/**
* @brief Asynchronous copy to host memory
*
* @param cudaStream_t& stream Device to host stream
*/
void async_copy(cudaStream_t& stream)
{
CUDA_ERROR_CHECK(cudaMemcpyAsync(host.a_ptr(), this->a_ptr(), _bytes, cudaMemcpyDeviceToHost, stream));
}
/**
* @brief Returns the number of bytes used for a single buffer
*
* @detail The actually occupied memory is twice this value due to double buffering
*/
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
};
// Namespace alias for convenient access to boost::interprocess functionality; only used for weights
namespace bip = boost::interprocess;
/**
* @brief Class providing buffers for beam weights
* @detail An object of Weights has the ability to read from a POSIX shared memory segment
*    to load updated beam weights.
* @note The current state is not final and will change in the future. The idea is
*    to provide an update method which is called by a shared memory instance.
*/
template<class T>
class Weights : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
public:
/**
* @brief Instantiates an object of Weights
*
* @param std::size_t Number of items in buffer
* @param std::string Name of the POSIX shared memory
*
* @detail Allocates twice the given size in device memory (double device buffer).
*    It also launches a boost::thread which creates, reads from and writes to the shared
*    memory.
*/
Weights(std::size_t size, std::string smem_name="SharedMemoryWeights")
: DoubleBuffer<thrust::device_vector<T>>()
, _smem_name(smem_name)
{
this->resize(size);
_bytes = size * sizeof(T);
t = new boost::thread(boost::bind(&Weights::run, this));
}
/**
* @brief Destroys an object of Weights
*/
~Weights(){}
/**
* @brief Creates, reads, writes and removes a POSIX shared memory segment
*
* @detail This function is a temporary solution to update beam weights on-the-fly
* while the pipeline is operating. In the future a clean interface will be created
* that provides additional monitoring information (e.g. power level) besides the
* beam weight updating mechanism.
*/
void run()
{
bip::shared_memory_object::remove(_smem_name.c_str());
bip::shared_memory_object smem(bip::create_only, _smem_name.c_str(), bip::read_write);
// Set size of shared memory including QueueHeader + payload
BOOST_LOG_TRIVIAL(info) << "Size of shared memory for weight uploading (IPC) " << sizeof(QueueHeader) + (this->size()) * sizeof(T);
smem.truncate(sizeof(QueueHeader) + (this->size()) * sizeof(T));
// Map shared memory to an addressable region
bip::mapped_region region(smem, bip::read_write);
void* smem_addr = region.get_address(); // get its address
QueueHeader* qheader = static_cast<QueueHeader*>(smem_addr); // Interpret first bytes as QueueHeader
T *ptr = reinterpret_cast<T*>(static_cast<char*>(smem_addr) + sizeof(QueueHeader)); // Pointer to the payload placed directly behind the QueueHeader (byte offset, not element offset)
qheader->stop = true;
while(qheader->stop){usleep(1000);} // Wait until a writer process clears the stop flag
while(!qheader->stop)
{
bip::scoped_lock<bip::interprocess_mutex> lock(qheader->mutex);
if(!qheader->data_in)
{
BOOST_LOG_TRIVIAL(debug) << "Waiting for writing weights to shared memory";
qheader->ready_to_read.wait(lock); // Wait for read out
}
BOOST_LOG_TRIVIAL(debug) << "Reading new weights from shared memory";
CUDA_ERROR_CHECK(cudaMemcpy((void*)this->b_ptr(), (void*)ptr,
_bytes, cudaMemcpyHostToDevice));
// Swap double buffer, so next batch is calculated with new weights
this->swap();
//Notify the other process that the buffer is empty
qheader->data_in = false;
qheader->ready_to_write.notify_all();
}
bip::shared_memory_object::remove(_smem_name.c_str());
BOOST_LOG_TRIVIAL(info) << "Closed shared memory for weight uploading";
}
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
std::string _smem_name;
boost::thread *t;
};
} // namespace cryopaf
} // namespace psrdada_cpp
#endif /* BUFFERTYPES_HPP_ */
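For reference, the writer side of this handshake (e.g. a weight-generating process) would mirror Weights::run() as sketched below (editorial addition; assumes the shared memory segment was already created by the Weights object and carries n items of type T):

// Producer-side sketch for uploading new weights via POSIX shared memory
#include <algorithm>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"

template<typename T>
void upload_weights(const T* weights, std::size_t n)
{
    bip::shared_memory_object smem(bip::open_only, "SharedMemoryWeights", bip::read_write);
    bip::mapped_region region(smem, bip::read_write);
    QueueHeader* qheader = static_cast<QueueHeader*>(region.get_address());
    T* payload = reinterpret_cast<T*>(
        static_cast<char*>(region.get_address()) + sizeof(QueueHeader));
    qheader->stop = false;                // release the reader from its start-up wait
    bip::scoped_lock<bip::interprocess_mutex> lock(qheader->mutex);
    while(qheader->data_in)               // previous upload not consumed yet
        qheader->ready_to_write.wait(lock);
    std::copy(weights, weights + n, payload);
    qheader->data_in = true;              // signal payload availability
    qheader->ready_to_read.notify_all();
}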
if(ENABLE_CUDA)
set(PSRDADA_CPP_CRYOPAF_LIBRARIES
${CMAKE_PROJECT_NAME}
${CMAKE_PROJECT_NAME}_cryopaf
${DEPENDENCY_LIBRARIES}
-lboost_system
-lpthread)
set(psrdada_cpp_cryopaf_src
Unpacker.cuh
Beamformer.cuh
BufferTypes.cuh
Pipeline.cuh
PipelineInterface.cuh)
cuda_add_library(${CMAKE_PROJECT_NAME}_cryopaf ${psrdada_cpp_cryopaf_src})
cuda_add_executable(beamforming src/beamforming_cli.cu)
target_link_libraries(beamforming ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
install(TARGETS beamforming DESTINATION bin)
cuda_add_executable(weightupdater src/weightupdater_cli.cu)
target_link_libraries(weightupdater ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
install(TARGETS weightupdater DESTINATION bin)
add_subdirectory(profiling)
add_subdirectory(test)
endif(ENABLE_CUDA)
/*
* Pipeline.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file consists of a single class (Pipeline<HandlerType, ComputeType, ResultType>)
* and a configuration structure (PipelineConfig).
* An object of Pipeline is used to access data from psrdada buffers, unpacks them,
* performs beamforming and writes the results back to another psrdada buffer.
* TODO:
* - We need a packer
* - We need a monitoring interface
*/
#ifndef PIPELINE_CUH_
#define PIPELINE_CUH_
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cuda.h>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/raw_bytes.hpp"
#include "psrdada_cpp/cryopaf/Unpacker.cuh"
#include "psrdada_cpp/cryopaf/Beamformer.cuh"
#include "psrdada_cpp/cryopaf/BufferTypes.cuh"
namespace psrdada_cpp{
namespace cryopaf{
struct PipelineConfig{
key_t in_key;
key_t out_key;
int device_id;
std::string logname;
std::size_t n_samples;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_beam;
std::size_t integration;
std::string input_type;
std::string mode;
std::string protocol;
const std::size_t n_pol = 2;
void print()
{
std::cout << "Pipeline configuration" << std::endl;
std::cout << "in_key: " << in_key << std::endl;
std::cout << "out_key: " << out_key << std::endl;
std::cout << "device_id: " << device_id << std::endl;
std::cout << "logname: " << logname << std::endl;
std::cout << "n_samples: " << n_samples << std::endl;
std::cout << "n_channel: " << n_channel << std::endl;
std::cout << "input_type: " << input_type << std::endl;
std::cout << "n_elements: " << n_elements << std::endl;
std::cout << "n_pol: " << n_pol << std::endl;
std::cout << "n_beam: " << n_beam << std::endl;
std::cout << "integration: " << integration << std::endl;
std::cout << "mode: " << mode << std::endl;
}
};
template<class HandlerType, class ComputeType, class ResultType>
class Pipeline{
// Internal type definitions
private:
typedef RawVoltage<char> RawInputType; // Type for received raw input data (voltage)
typedef RawVoltage<ComputeType> InputType;// Type for unpacked raw input data (voltage)
typedef Weights<ComputeType> WeightType; // Type for beam weights
typedef BeamOutput<ResultType> OutputType;// Type for beamformed output data
public:
/**
* @brief Constructs an object of Pipeline
*
* @param PipelineConfig conf Pipeline configuration containing all necessary parameters (declared above)
* @param MultiLog log Logging instance
* @param HandlerType handler Object for handling output data
*
* @detail Initializes the pipeline environment including device memory and processor objects.
*/
Pipeline(PipelineConfig& conf, MultiLog &log, HandlerType &handler);
/**
* @brief Destroys an object of Pipeline
*
* @detail Destroys all objects and allocated memory
*/
~Pipeline();
/**
* @brief Initialise the pipeline with a DADA header block
*
* @param header_block A RawBytes object wrapping the DADA header block
*/
void init(RawBytes &header_block);
/**
* @brief Process the data in a DADA data buffer
*
* @param dada_block A RawBytes object wrapping the DADA data block
*/
bool operator()(RawBytes &dada_block);
// Internal attributes
private:
HandlerType &_handler;
MultiLog &_log;
PipelineConfig& _conf;
// Processors
Unpacker<ComputeType>* unpacker = nullptr; // Object to unpack and transpose received input data on GPU to an expected format
Beamformer<ComputeType>* beamformer = nullptr; // Object to perform beamforming on GPU
// Buffers
RawInputType *_raw_input_buffer = nullptr; // Received input buffer
InputType *_input_buffer = nullptr; // Unpacked and transposed input buffer
WeightType *_weight_buffer = nullptr; // Beam weights, updated through shared memory
OutputType *_output_buffer = nullptr; // Output buffer containing processed beams
std::size_t _call_cnt = 0; // Internal dada block counter
cudaStream_t _h2d_stream; // Host to device CUDA stream (used for async copies)
cudaStream_t _prc_stream; // Processing stream
cudaStream_t _d2h_stream; // Device to host CUDA stream (used for async copies)
#ifdef DEBUG
// Time measurement variables (debugging only)
cudaEvent_t start, stop;
float ms;
#endif
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/src/Pipeline.cu"
#endif /* PIPELINE_CUH_ */
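Since HandlerType only needs to mirror the init()/operator() pair declared above, a minimal output handler can be sketched as follows (editorial addition; NullHandler is hypothetical, and the return convention is assumed from the bool signature of Pipeline::operator()):

#include "psrdada_cpp/raw_bytes.hpp"

// Minimal handler satisfying the HandlerType interface assumed by Pipeline
struct NullHandler
{
    void init(psrdada_cpp::RawBytes& header) { /* inspect the DADA header */ }
    bool operator()(psrdada_cpp::RawBytes& block)
    {
        // consume one block of beamformed output;
        // return value assumed to signal whether processing should stop
        return false;
    }
};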
#ifndef PIPELINE_INTERFACE_HPP
#define PIPELINE_INTERFACE_HPP
#include <vector>
#include <string>
#include <unistd.h>
#include <random>
#include <cmath>
#include <complex>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <boost/thread.hpp>
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"
namespace psrdada_cpp{
namespace cryopaf{
namespace bip = boost::interprocess;
struct PipelineInterfaceConfig{
std::string logname;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_pol;
std::size_t n_beam;
std::string mode;
void print()
{
std::cout << "Pipeline interface configuration" << std::endl;
std::cout << "logname: " << logname << std::endl;
std::cout << "n_channel: " << n_channel << std::endl;
std::cout << "n_elements: " << n_elements << std::endl;
std::cout << "n_pol: " << n_pol << std::endl;
std::cout << "n_beam: " << n_beam << std::endl;
std::cout << "mode: " << mode << std::endl;
}
};
template<class T>
class PipelineInterface
{
public:
PipelineInterface(PipelineInterfaceConfig& config, MultiLog& logger);
~PipelineInterface();
void run();
virtual void update() = 0;
private:
std::string smem_name = "SharedMemoryWeights";
bip::shared_memory_object smem;
bip::mapped_region region;
void* smem_addr = nullptr;
T* smem_weights = nullptr;
QueueHeader *qheader;
protected:
PipelineInterfaceConfig& conf;
MultiLog& log;
std::vector<T> vect_weights;
bool quit = false;
std::size_t update_cnt = 0;
};
template<class T>
class SimpleWeightGenerator : public PipelineInterface<T>
{
public:
SimpleWeightGenerator(PipelineInterfaceConfig& config, MultiLog& logger);
~SimpleWeightGenerator();
void update();
private:
void bypass();
void random();
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/src/PipelineInterface.cu"
#endif // end PIPELINE_INTERFACE_HPP
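A custom weight source can be derived analogously to SimpleWeightGenerator. The sketch below (editorial addition; the class is hypothetical) assumes that update() is expected to refill vect_weights, which the base class then publishes through shared memory:

// Trivial weight source: unit gain, zero phase on every entry
template<class T>
class ConstantWeightGenerator : public psrdada_cpp::cryopaf::PipelineInterface<T>
{
public:
    ConstantWeightGenerator(psrdada_cpp::cryopaf::PipelineInterfaceConfig& config,
                            psrdada_cpp::MultiLog& logger)
        : psrdada_cpp::cryopaf::PipelineInterface<T>(config, logger) {}
    void update() override
    {
        for(auto& w : this->vect_weights)
            w = T{1, 0};
    }
};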
/*
* QueueHeader.hpp
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file contains the structure which is used for shared memory IPC
*/
#ifndef QUEUE_HEADER_HPP_
#define QUEUE_HEADER_HPP_
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/interprocess_condition.hpp>
namespace bip = boost::interprocess;
/**
* @brief Header stored in POSIX shared memory to allow IPC communication
*/
struct QueueHeader
{
QueueHeader()
: data_in(false),
stop(true)
{}
// Mutex to protect access to the queue
bip::interprocess_mutex mutex;
// Condition to wait when the queue is empty
bip::interprocess_condition ready_to_read;
// Condition to wait when the queue is full
bip::interprocess_condition ready_to_write;
// Is there any payload?
bool data_in;
// Stop flag, will force the owner to remove the shared memory
bool stop;
};
#endif /* QUEUE_HEADER_HPP_ */
#ifndef UNPACKER_CUH
#define UNPACKER_CUH
#include <cuda.h>
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "psrdada_cpp/common.hpp"
#include "psrdada_cpp/multilog.hpp"
#define NCHAN_CHK 7
#define NSAMP_DF 128
#define NPOL_SAMP 2
#define NSAMP_PER_HEAP 1024
namespace psrdada_cpp {
namespace cryopaf {
template<typename T>__global__
void unpack_codif_to_fpte(uint64_t const* __restrict__ idata, T* __restrict__ odata);
template<typename U, typename T>__global__
void unpack_spead_ttfep_to_fpte(U const* __restrict__ idata, T* __restrict__ odata);
template<typename T>
class Unpacker
{
public:
Unpacker(cudaStream_t& stream,
std::size_t nsamples,
std::size_t nchannels,
std::size_t nelements,
std::string protocol);
~Unpacker();
Unpacker(Unpacker const&) = delete;
void unpack(char* input, T* output);
void print_layout();
int sample_size(){return _sample_size;}
private:
cudaStream_t& _stream;
int _sample_size = 8; // TODO: dynamic for spead and fixed for codif
std::string _protocol;
dim3 grid;
dim3 block;
};
} //namespace cryopaf
} //namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/details/UnpackerKernels.cu"
#include "psrdada_cpp/cryopaf/src/Unpacker.cu"
#endif // UNPACKER_CUH
#ifdef UNPACKER_CUH
namespace psrdada_cpp {
namespace cryopaf{
// Byte-swaps a 64-bit word (big endian network payload to little endian)
// by reversing the byte order of each 32-bit half and exchanging the halves
__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
uint64_t result;
uint2 t;
// Split the 64-bit word into two 32-bit registers
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(t.x), "=r"(t.y) : "l"(x));
// Reverse the byte order within each half
t.x = __byte_perm(t.x, 0, 0x0123);
t.y = __byte_perm(t.y, 0, 0x0123);
// Re-assemble with the halves exchanged
asm("mov.b64 %0,{%1,%2}; \n\t"
: "=l"(result) : "r"(t.y), "r"(t.x));
return result;
}
template<typename T>__global__
void unpack_codif_to_fpte(uint64_t const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x + blockIdx.x * blockDim.x; // Time
int elem = threadIdx.y + blockIdx.y * blockDim.y; // Elements
int freq = threadIdx.z + blockIdx.z * blockDim.z; // Frequency
int chan = blockDim.z * gridDim.z;
int time_in = blockIdx.x * blockDim.x * gridDim.y * chan + threadIdx.x * chan;
int freq_in = freq;
int elem_in = elem * NSAMP_DF * chan ;
int freq_out = freq * NPOL_SAMP * gridDim.x * blockDim.x * gridDim.y;
int time_out = time * gridDim.y;
int in_idx = time_in + freq_in + elem_in;
int out_idx_x = freq_out + time_out + elem;
int out_idx_y = freq_out + gridDim.x * blockDim.x * gridDim.y + time_out + elem;
uint64_t tmp = swap64(idata[in_idx]);
odata[out_idx_x].x = static_cast<decltype(T::x)>((tmp & 0x000000000000ffffLL));
odata[out_idx_x].y = static_cast<decltype(T::y)>((tmp & 0x00000000ffff0000LL) >> 16);
odata[out_idx_y].x = static_cast<decltype(T::x)>((tmp & 0x0000ffff00000000LL) >> 32);
odata[out_idx_y].y = static_cast<decltype(T::y)>((tmp & 0xffff000000000000LL) >> 48);
}
template<typename U, typename T>__global__
void unpack_spead_ttfep_to_fpte(U const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x; // Time
int elem = blockIdx.y; // Elements
int freq = blockIdx.z; // Frequency
int heap_idx = blockIdx.x;
int in_idx = heap_idx * NSAMP_PER_HEAP * gridDim.z * gridDim.y * NPOL_SAMP // Outer time axis
+ time * gridDim.z * gridDim.y * NPOL_SAMP // Inner time axis
+ freq * gridDim.y * NPOL_SAMP // Frequency axis
+ elem * NPOL_SAMP; // Element axis
int out_idx_x = freq * NPOL_SAMP * gridDim.x * NSAMP_PER_HEAP * gridDim.y // Frequency axis
+ (time + blockIdx.x * blockDim.x) * gridDim.y
+ elem;
int out_idx_y = freq * NPOL_SAMP * gridDim.x * NSAMP_PER_HEAP * gridDim.y // Frequency axis
+ gridDim.x * NSAMP_PER_HEAP * gridDim.y
+ (time + blockIdx.x * blockDim.x) * gridDim.y
+ elem;
odata[out_idx_x].x = static_cast<decltype(T::x)>(idata[in_idx].x);
odata[out_idx_x].y = static_cast<decltype(T::y)>(idata[in_idx].y);
odata[out_idx_y].x = static_cast<decltype(T::x)>(idata[in_idx + 1].x);
odata[out_idx_y].y = static_cast<decltype(T::y)>(idata[in_idx + 1].y);
}
// ########################################################
// NOTE: The kernel below is deprecated and no longer used
// ########################################################
/*
template<typename T>__global__
void unpack_codif_to_tfep(uint64_t const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x + blockIdx.x * blockDim.x; // Time
int elem = threadIdx.y + blockIdx.y * blockDim.y; // Elements
int freq = threadIdx.z + blockIdx.z * blockDim.z; // Frequency
int chan = blockDim.z * gridDim.z;
int time_in = blockIdx.x * blockDim.x * gridDim.y * chan + threadIdx.x * chan;
int freq_in = freq;
int elem_in = elem * NSAMP_DF * chan ;
int time_out = time * chan * gridDim.y * NPOL_SAMP;
int freq_out = freq * gridDim.y * NPOL_SAMP;
int elem_out = elem * NPOL_SAMP;
int in_idx = time_in + freq_in + elem_in;
int out_idx = time_out + freq_out + elem_out;
uint64_t tmp = swap64(idata[in_idx]);
odata[out_idx].x = static_cast<decltype(T::x)>((tmp & 0x000000000000ffffLL));
odata[out_idx].y = static_cast<decltype(T::y)>((tmp & 0x00000000ffff0000LL) >> 16);
odata[out_idx + 1].x = static_cast<decltype(T::x)>((tmp & 0x0000ffff00000000LL) >> 32);
odata[out_idx + 1].y = static_cast<decltype(T::y)>((tmp & 0xffff000000000000LL) >> 48);
}
*/
}
}
#endif
#ifndef UTILS_CU
#define UTILS_CU
/** UTILS **/
// Complex multiplication for __half2 values (.x = real part, .y = imaginary part)
__device__ __half2 __hCmul2(__half2 a, __half2 b)
{
const __half r = a.x * b.x - a.y * b.y;
const __half i = a.x * b.y + a.y * b.x;
__half2 val; val.x = r; val.y = i;
return val;
}
// Complex multiply-add: returns a * b + c
template<typename T>
__host__ __device__ T cmadd(T a, T b, T c)
{
T val;
val.x = a.x * b.x - a.y * b.y + c.x;
val.y = a.x * b.y + a.y * b.x + c.y;
return val;
}
// Complex addition
template<typename T>
__host__ __device__ T cadd(T a, T b)
{
T val;
val.x = a.x + b.x;
val.y = a.y + b.y;
return val;
}
// Complex subtraction
template<typename T>
__host__ __device__ T csub(T a, T b)
{
T val;
val.x = a.x - b.x;
val.y = a.y - b.y;
return val;
}
// Complex magnitude |a|
template<typename T>
__host__ __device__ double cabs(T a)
{
return (double)sqrt((double)(a.x * a.x + a.y * a.y));
}
#endif
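Taken together, these helpers describe the core arithmetic of the beamformer: per (channel, sample, beam) the kernels reduce to a complex multiply-accumulate over elements. A scalar reference using the helpers above (editorial sketch; ignores the conjugation convention of the weights):

// Reference for one beam sample: acc = sum over e of in[e] * w[e]
template<typename T>
__host__ __device__ T beam_sample_reference(const T* in, const T* w, int nelem)
{
    T acc{0, 0};
    for(int e = 0; e < nelem; ++e)
        acc = cmadd(in[e], w[e], acc); // acc += in[e] * w[e]
    return acc;
}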
set(profiling_src
KernelStatistics.cuh
profiling.cu
)
cuda_add_executable(profiling ${profiling_src} )
target_link_libraries(profiling ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
#ifndef KERNEL_STATISTICS_CUH
#define KERNEL_STATISTICS_CUH
#include <fstream>
#include <cuda.h>
#include "psrdada_cpp/cuda_utils.hpp"
struct ProfileConfig{
int device_id;
std::size_t n_samples;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_beam;
std::size_t integration;
std::string precision;
std::string protocol;
std::string out_dir;
const std::size_t n_pol = 2;
};
class KernelProfiler
{
public:
KernelProfiler(ProfileConfig& config,
cudaStream_t& cuda_stream,
std::string kernel_name,
std::size_t complexity,
std::size_t read_size,
std::size_t write_size,
std::size_t input_size);
~KernelProfiler();
void measure_start();
void measure_stop();
void update(float time_ms);
void finialize();
void export_to_csv(std::string filename);
private:
ProfileConfig& conf;
cudaStream_t& stream;
cudaDeviceProp prop;
std::string name;
std::string head_line;
cudaEvent_t start, stop;
std::vector<float> elapsed_time;
std::vector<float> compute_tput;
std::vector<float> memory_bw;
std::vector<float> input_bw;
std::vector<float> output_bw;
float peak_mem_bandwidth;
float ms = 0;
float avg_time = 0;
float min_time = 0;
float max_time = 0;
float avg_tput = 0;
float min_tput = 0;
float max_tput = 0;
float avg_m_bw = 0;
float min_m_bw = 0;
float max_m_bw = 0;
float avg_m_bw_perc = 0;
float min_m_bw_perc = 0;
float max_m_bw_perc = 0;
float avg_i_bw = 0;
float min_i_bw = 0;
float max_i_bw = 0;
float avg_o_bw = 0;
float min_o_bw = 0;
float max_o_bw = 0;
std::size_t iterations = 0;
std::size_t reads;
std::size_t writes;
std::size_t input_sz;
std::size_t compute_complexity;
};
#include "psrdada_cpp/cryopaf/profiling/src/KernelStatistics.cu"
#endif
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/complex.h>
#include <cuda.h>
#include <random>
#include <cmath>
#include <fstream>
#include <chrono>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/cryopaf/Unpacker.cuh"
#include "psrdada_cpp/cryopaf/Beamformer.cuh"
#include "psrdada_cpp/cryopaf/profiling/KernelStatistics.cuh"
const size_t ERROR_IN_COMMAND_LINE = 1;
const size_t SUCCESS = 0;
using namespace psrdada_cpp;
using namespace psrdada_cpp::cryopaf;
using namespace std::chrono;
template<typename T>
void profile(ProfileConfig conf, std::size_t iter)
{
// If the template parameter is not a complex type, profiling has to be aborted
if( !(std::is_same<T, float2>::value)
&& !(std::is_same<T, __half2>::value))
{
BOOST_LOG_TRIVIAL(error) << "ProfilingError: Template type not supported";
exit(1);
}
cudaStream_t stream;
CUDA_ERROR_CHECK(cudaStreamCreate(&stream));
// Instantiate processor objects
Beamformer<T> beamformer(stream,
conf.n_samples,
conf.n_channel,
conf.n_elements,
conf.n_beam,
conf.integration);
Unpacker<T> unpacker(stream,
conf.n_samples,
conf.n_channel,
conf.n_elements,
conf.protocol);
// Calculate memory sizes for input, weights and output
std::size_t input_size = conf.n_samples
* conf.n_elements
* conf.n_channel
* conf.n_pol;
std::size_t weight_size = conf.n_beam
* conf.n_elements
* conf.n_channel
* conf.n_pol;
std::size_t output_size = conf.n_samples
* conf.n_beam
* conf.n_channel;
std::size_t required_mem = input_size * sizeof(T)
+ weight_size * sizeof(T)
+ output_size * sizeof(decltype(T::x));
BOOST_LOG_TRIVIAL(debug) << "Required device memory: " << std::to_string(required_mem / (1024*1024)) << "MiB";
// Allocate device memory
thrust::device_vector<char> input_up(input_size * unpacker.sample_size(),0);
thrust::device_vector<T> input_bf(input_size, {.1, .1});
thrust::device_vector<T> weights_bf(weight_size, {.1, .1});
thrust::device_vector<T> output_bf_voltage(output_size * conf.n_pol);
thrust::device_vector<decltype(T::x)> output_bf_power(output_size / conf.integration, 0);
// Calculation of compute complexity
std::size_t n = conf.n_beam
* conf.n_channel
* conf.n_elements
* conf.n_pol
* conf.n_samples;
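// Assumed FLOP accounting (editorial note): one complex multiply-accumulate
// costs 8 real FLOPs (4 multiplications + 4 additions), hence 8 * n for the
// voltage beamformer; Stokes I detection adds roughly 4 * n for |y|^2 and
// its accumulation.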
std::size_t complexity_bf_vol = 8 * n;
std::size_t complexity_bf_pow = 8 * n + 4 * n; // + detection and accumulation
std::size_t complexity_unpack = n / conf.n_beam;
// Create KernelStatistics object for each kernel
KernelProfiler voltage_profiler(conf, stream,
"Voltage",
complexity_bf_vol,
(input_bf.size() + weights_bf.size()) * sizeof(T),
output_bf_voltage.size() * sizeof(T),
input_bf.size() * sizeof(T));
KernelProfiler power_profiler(conf, stream,
"StokesI",
complexity_bf_pow,
(input_bf.size() + weights_bf.size()) * sizeof(T),
output_bf_power.size() * sizeof(decltype(T::x)),
input_bf.size() * sizeof(T));
KernelProfiler unpack_profiler(conf, stream,
"Unpacker",
complexity_unpack,
input_up.size() * sizeof(uint64_t),
input_bf.size() * sizeof(T),
input_up.size() * sizeof(uint64_t));
// Run all used kernels i-times
for(int i = 0; i < iter; i++)
{
// Call Stokes I detection beamformer kernel
power_profiler.measure_start();
beamformer.process(
thrust::raw_pointer_cast(input_bf.data()),
thrust::raw_pointer_cast(weights_bf.data()),
thrust::raw_pointer_cast(output_bf_power.data()));
power_profiler.measure_stop();
// Call to voltage beamformer kernel
voltage_profiler.measure_start();
beamformer.process(
thrust::raw_pointer_cast(input_bf.data()),
thrust::raw_pointer_cast(weights_bf.data()),
thrust::raw_pointer_cast(output_bf_voltage.data()));
voltage_profiler.measure_stop();
// Call to unpacking kernel
unpack_profiler.measure_start();
unpacker.unpack(
thrust::raw_pointer_cast(input_up.data()),
thrust::raw_pointer_cast(input_bf.data()));
unpack_profiler.measure_stop();
}
CUDA_ERROR_CHECK(cudaStreamDestroy(stream));
power_profiler.finialize();
voltage_profiler.finialize();
unpack_profiler.finialize();
power_profiler.export_to_csv(conf.out_dir + "power_kernel.csv");
voltage_profiler.export_to_csv(conf.out_dir + "voltage_kernel.csv");
unpack_profiler.export_to_csv(conf.out_dir + "unpacker_kernel.csv");
}
int main(int argc, char** argv)
{
// Variables to store command line options
ProfileConfig conf;
int iter;
// Parse command line
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help,h", "Print help messages")
("samples", po::value<std::size_t>(&conf.n_samples)->default_value(1024), "Number of samples within one batch")
("channels", po::value<std::size_t>(&conf.n_channel)->default_value(14), "Number of channels")
("elements", po::value<std::size_t>(&conf.n_elements)->default_value(WARP_SIZE*4), "Number of elements")
("beams", po::value<std::size_t>(&conf.n_beam)->default_value(64), "Number of beams")
("integration", po::value<std::size_t>(&conf.integration)->default_value(1), "Integration interval; must be multiple 2^n and smaller 32")
("device", po::value<int>(&conf.device_id)->default_value(0), "ID of GPU device")
("protocol", po::value<std::string>(&conf.protocol)->default_value("codif"), "Protocol of input data; supported protocol 'codif'")
("iteration", po::value<int>(&iter)->default_value(5), "Iterations to run")
("precision", po::value<std::string>(&conf.precision)->default_value("half"), "Compute type of GEMM operation; supported precisions 'half' and 'single'")
("outdir", po::value<std::string>(&conf.out_dir)->default_value("Results/"), "Output directory to store csv files");
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc), vm);
if ( vm.count("help") )
{
std::cout << "Beamform Profiling" << std::endl
<< desc << std::endl;
return SUCCESS;
}
po::notify(vm);
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return ERROR_IN_COMMAND_LINE;
}
if(conf.precision == "half")
{
profile<__half2>(conf, iter);
}
else if(conf.precision == "single")
{
profile<float2>(conf, iter);
}
else
{
BOOST_LOG_TRIVIAL(error) << "Compute type " << precision << " not implemented";
}
return 0;
}
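For reference, a typical invocation of the resulting binary (target `profiling` from the CMakeLists above; all flags as defined in desc) might be:

profiling --samples 1024 --channels 14 --elements 128 --beams 64 --precision half --iteration 5 --outdir Results/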