Commit c8b3d3b6 authored by Niclas Esser

Merged cryopaf into devel

parent 6a287b3c
Pipeline #100153 passed
Showing 3294 additions and 0 deletions
@@ -88,3 +88,4 @@ install(DIRECTORY detail DESTINATION include/psrdada_cpp)
add_subdirectory(meerkat)
add_subdirectory(effelsberg)
add_subdirectory(cryopaf)
/*
* Beamformer.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file consists of a single class (Beamformer<ComputeType>). An object of Beamformer
* can be used to perform either a Stokes I detection or raw voltage beamforming
* on a GPU.
* Both beamforming kernels expect the same data product (linearly aligned in device memory):
* Input: F-P-T-E
* Weight: F-P-B-E
* Output: F-T-B-P (voltage beams)
* Output: F-T-B (Stokes I beams)
*/
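/*
* Illustration (editorial addition, derived from the format strings above):
* with F channels, P polarisations, T samples, E elements and B beams, the
* linear index of an input sample (f, p, t, e) in the F-P-T-E layout is
*     idx_in  = ((f * P + p) * T + t) * E + e;
* a weight (f, p, b, e) in the F-P-B-E layout is
*     idx_w   = ((f * P + p) * B + b) * E + e;
* and a Stokes I output sample (f, t, b) in the F-T-B layout is
*     idx_out = (f * T + t) * B + b;
*/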
#ifndef BEAMFORMER_CUH_
#define BEAMFORMER_CUH_
#include <cuda.h>
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
namespace psrdada_cpp{
namespace cryopaf{
// Constants for beamform kernels
#define NTHREAD 1024
#define TILE_SIZE 32
#define WARP_SIZE 32
/**
* @brief GPU kernel to perform Stokes I detection beamforming
*
* @detail Template type T has to be either T=float2 or T=__half2.
*    According to T, U has to be either U=float or U=__half
*
* @param T* idata pointer to input memory (format: F-P-T-E)
* @param T* wdata pointer to beam weight memory (format: F-P-B-E)
* @param U* odata pointer to output memory; U is equal to decltype(T::x) (format: F-T-B)
* @param int time Width of time dimension (T)
* @param int elem Number of elements (E)
* @param int beam Number of beams (B)
* @param int integrate Integration time; must be a power of 2 and no greater than 32
*
* @TODO: Allow greater integration time
*/
template<typename T, typename U>__global__
void beamformer_power_fpte_fpbe_ftb(
const T *idata,
const T* wdata,
U *odata,
int time,
int elem,
int beam,
int integrate);
/**
* @brief GPU kernel to perform raw voltage beamforming
*
* @detail Template type T has to be either T=float2 or T=__half2
*
* @param T* idata pointer to input memory (format: F-P-T-E)
* @param T* wdata pointer to beam weight memory (format: F-P-B-E)
* @param T* odata pointer to output memory (format: F-P-T-B)
* @param int time Width of time dimension (T)
* @param int elem Number of elements (E)
* @param int beam Number of beams (B)
*/
template<typename T>__global__
void beamformer_voltage_fpte_fpbe_fptb(
const T *idata,
const T* wdata,
T *odata,
int time,
int elem,
int beam);
template<class ComputeType>
class Beamformer{
// Internal type definitions
private:
typedef decltype(ComputeType::x) ResultType; // Only needed for the Stokes I beamformer
// Public functions
public:
/**
* @brief constructs an object of Beamformer<ComputeType> (ComputeType=float2 or ComputeType=__half2)
*
* @param cudaStream_t& stream Object of cudaStream_t to allow parallel copy + processing (has to be created and destroyed elsewhere)
* @param std::size_t sample Number of samples to process in one kernel launch (no restrictions)
* @param std::size_t channel Number of channels to process in one kernel launch (no restrictions)
* @param std::size_t element Number of elements to process in one kernel launch (no restrictions)
* @param std::size_t beam Number of beams to process in one kernel launch (no restrictions)
* @param std::size_t integration Samples to be integrated; must be a power of 2 and no greater than 32
*/
Beamformer(
cudaStream_t& stream,
std::size_t sample,
std::size_t channel,
std::size_t element,
std::size_t beam,
std::size_t integration = 1);
/**
* @brief Destroys an object of Beamformer<ComputeType> (ComputeType=float2 or ComputeType=__half2)
*/
~Beamformer();
/**
* @brief Launches voltage beamforming GPU kernel
*
* @param ComputeType* input pointer to input memory (format: F-P-T-E)
* @param ComputeType* weights pointer to beam weight memory (format: F-P-B-E)
* @param ComputeType* output pointer to output memory (format: F-T-B-P)
*/
void process(
const ComputeType* input,
const ComputeType* weights,
ComputeType* output);
/**
* @brief Launches Stokes I beamforming GPU kernel
*
* @param ComputeType* input pointer to input memory (format: F-P-T-E)
* @param ComputeType* weights pointer to beam weight memory (format: F-P-B-E)
* @param ResultType* output pointer to output memory (format: F-T-B)
*/
void process(
const ComputeType* input,
const ComputeType* weights,
ResultType* output);
/**
* @brief Prints the block and grid layout of a kernel (used for debugging purposes)
*/
void print_layout();
// Private attributes
private:
cudaStream_t& _stream;
dim3 grid;
dim3 block;
std::size_t _sample;
std::size_t _channel;
std::size_t _element;
std::size_t _beam;
std::size_t _integration;
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/details/utils.cu"
#include "psrdada_cpp/cryopaf/details/BeamformerKernels.cu"
#include "psrdada_cpp/cryopaf/src/Beamformer.cu"
#endif /* BEAMFORMER_CUH_ */
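A minimal usage sketch of the Beamformer class (editorial addition; all names and sizes are hypothetical, buffer extents follow the F-P-T-E / F-P-B-E / F-T-B formats documented above):

// Sketch: Stokes I beamforming with single precision input
#include <thrust/device_vector.h>
#include "psrdada_cpp/cryopaf/Beamformer.cuh"

using namespace psrdada_cpp::cryopaf;

void beamform_example()
{
    std::size_t sample = 1024, channel = 14, element = 128, beam = 64, integration = 4;
    cudaStream_t stream;
    CUDA_ERROR_CHECK(cudaStreamCreate(&stream));
    Beamformer<float2> bf(stream, sample, channel, element, beam, integration);
    // n_pol = 2 is fixed throughout the pipeline
    thrust::device_vector<float2> input(channel * 2 * sample * element);
    thrust::device_vector<float2> weights(channel * 2 * beam * element);
    thrust::device_vector<float>  output(channel * sample * beam / integration);
    // Overload resolution picks the Stokes I kernel for a ResultType* output
    bf.process(thrust::raw_pointer_cast(input.data()),
               thrust::raw_pointer_cast(weights.data()),
               thrust::raw_pointer_cast(output.data()));
    // process() enqueues work on the stream, so synchronize before reading
    CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
    CUDA_ERROR_CHECK(cudaStreamDestroy(stream));
}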
/*
* BufferTypes.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file contains classes for different kinds of buffer representation.
* All implemented classes inherit from DoubleBuffer<thrust::device_vector<T>>.
*/
#ifndef BUFFERTYPES_HPP_
#define BUFFERTYPES_HPP_
// boost::interprocess used to upload weights via POSIX shared memory
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <boost/thread.hpp>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/double_device_buffer.cuh"
#include "psrdada_cpp/double_host_buffer.cuh"
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"
namespace psrdada_cpp {
namespace cryopaf{
/**
* @brief Class providing buffers for raw voltage data
*/
template<class T>
class RawVoltage : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
public:
/**
* @brief Instantiates an object of RawVoltage
*
* @param std::size_t Number of items in buffer
*
* @detail Allocates twice the given size in device memory, since it is a double device buffer
*/
RawVoltage(std::size_t size)
: DoubleBuffer<thrust::device_vector<T>>()
{
this->resize(size);
_bytes = size * sizeof(T);
}
/**
* @brief Destroys an object of RawVoltage
*/
~RawVoltage(){}
/**
* @brief Returns the number of bytes used for a single buffer
*
* @detail The actually occupied device memory is twice this value due to double buffering
*/
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
};
/**
* @brief Class providing buffers for beam data (always the output of the pipeline)
* @detail An object of BeamOutput also contains an instance of DoublePinnedHostBuffer<T>
* to allow an asynchronous copy to the host memory.
*/
template<class T>
class BeamOutput : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
DoublePinnedHostBuffer<T> host;
public:
/**
* @brief Instantiates an object of BeamOutput
*
* @param std::size_t Number of items in buffer
*
* @detail Allocates twice the given size in device memory and in host memory (double buffers)
*/
BeamOutput(std::size_t size)
: DoubleBuffer<thrust::device_vector<T>>()
{
this->resize(size);
host.resize(size);
_bytes = size * sizeof(T);
}
/**
* @brief Destroys an object of BeamOutput
*/
~BeamOutput(){}
/**
* @brief Asynchronous copy to host memory
*
* @param cudaStream_t& stream Device to host stream
*/
void async_copy(cudaStream_t& stream)
{
CUDA_ERROR_CHECK(cudaMemcpyAsync(host.a_ptr(), this->a_ptr(), _bytes, cudaMemcpyDeviceToHost, stream));
}
/**
* @brief Returns the number of bytes used for a single buffer
*
* @detail The actually occupied memory is twice this value due to double buffering
*/
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
};
// Namespace alias for convenient access to boost::interprocess functionality; only used for weights
namespace bip = boost::interprocess;
/**
* @brief Class providing buffers for beam weights
* @detail An object of Weights has the ability to read from a POSIX shared memory segment
*    to load updated beam weights.
* @note The current state is not final and will change in the future. The idea is
*    to provide an update method which is called by a shared memory instance.
*/
template<class T>
class Weights : public DoubleBuffer<thrust::device_vector<T>>
{
public:
typedef T type;
public:
/**
* @brief Instantiates an object of Weights
*
* @param std::size_t Number of items in buffer
* @param std::string Name of the POSIX shared memory
*
* @detail Allocates twice the given size in device memory (double device buffer).
*    It also launches a boost::thread which creates, reads from and writes to the shared
*    memory.
*/
Weights(std::size_t size, std::string smem_name="SharedMemoryWeights")
: DoubleBuffer<thrust::device_vector<T>>()
, _smem_name(smem_name)
{
this->resize(size);
_bytes = size * sizeof(T);
t = new boost::thread(boost::bind(&Weights::run, this));
}
/**
* @brief Destroys an object of Weights
*/
~Weights(){}
/**
* @brief Creates, reads, writes and removes a POSIX shared memory segment
*
* @detail This function is a temporary solution to update beam weights on-the-fly
* while the pipeline is operating. In the future a clean interface will be created
* that provides additional monitoring information (e.g. power level) besides the
* beam weight updating mechanism.
*/
void run()
{
bip::shared_memory_object::remove(_smem_name.c_str());
bip::shared_memory_object smem(bip::create_only, _smem_name.c_str(), bip::read_write);
// Set size of shared memory including QueueHeader + payload
BOOST_LOG_TRIVIAL(info) << "Size of shared memory for weight uploading (IPC) " << sizeof(QueueHeader) + (this->size()) * sizeof(T);
smem.truncate(sizeof(QueueHeader) + (this->size()) * sizeof(T));
// Map shared memory to an addressable region
bip::mapped_region region(smem, bip::read_write);
void* smem_addr = region.get_address(); // get its address
QueueHeader* qheader = static_cast<QueueHeader*>(smem_addr); // Interpret first bytes as QueueHeader
T *ptr = reinterpret_cast<T*>(static_cast<char*>(smem_addr) + sizeof(QueueHeader)); // Pointer to the payload placed directly behind the QueueHeader (byte offset, not element offset)
qheader->stop = true;
while(qheader->stop){usleep(1000);} // Wait until a writer process clears the stop flag
while(!qheader->stop)
{
bip::scoped_lock<bip::interprocess_mutex> lock(qheader->mutex);
if(!qheader->data_in)
{
BOOST_LOG_TRIVIAL(debug) << "Waiting for writing weights to shared memory";
qheader->ready_to_read.wait(lock); // Wait for read out
}
BOOST_LOG_TRIVIAL(debug) << "Reading new weights from shared memory";
CUDA_ERROR_CHECK(cudaMemcpy((void*)this->b_ptr(), (void*)ptr,
_bytes, cudaMemcpyHostToDevice));
// Swap double buffer, so next batch is calculated with new weights
this->swap();
//Notify the other process that the buffer is empty
qheader->data_in = false;
qheader->ready_to_write.notify_all();
}
bip::shared_memory_object::remove(_smem_name.c_str());
BOOST_LOG_TRIVIAL(info) << "Closed shared memory for weight uploading";
}
std::size_t total_bytes(){return _bytes;}
private:
std::size_t _bytes;
std::string _smem_name;
boost::thread *t;
};
} // namespace cryopaf
} // namespace psrdada_cpp
#endif /* BUFFERTYPES_HPP_ */
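For reference, the writer side of this handshake (e.g. a weight-generating process) would mirror Weights::run() as sketched below (editorial addition; assumes the shared memory segment was already created by the Weights object and carries n items of type T):

// Producer-side sketch for uploading new weights via POSIX shared memory
#include <algorithm>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"

template<typename T>
void upload_weights(const T* weights, std::size_t n)
{
    bip::shared_memory_object smem(bip::open_only, "SharedMemoryWeights", bip::read_write);
    bip::mapped_region region(smem, bip::read_write);
    QueueHeader* qheader = static_cast<QueueHeader*>(region.get_address());
    T* payload = reinterpret_cast<T*>(
        static_cast<char*>(region.get_address()) + sizeof(QueueHeader));
    qheader->stop = false;                // release the reader from its start-up wait
    bip::scoped_lock<bip::interprocess_mutex> lock(qheader->mutex);
    while(qheader->data_in)               // previous upload not consumed yet
        qheader->ready_to_write.wait(lock);
    std::copy(weights, weights + n, payload);
    qheader->data_in = true;              // signal payload availability
    qheader->ready_to_read.notify_all();
}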
if(ENABLE_CUDA)
set(PSRDADA_CPP_CRYOPAF_LIBRARIES
${CMAKE_PROJECT_NAME}
${CMAKE_PROJECT_NAME}_cryopaf
${DEPENDENCY_LIBRARIES}
-lboost_system
-lpthread)
set(psrdada_cpp_cryopaf_src
Unpacker.cuh
Beamformer.cuh
BufferTypes.cuh
Pipeline.cuh
PipelineInterface.cuh)
cuda_add_library(${CMAKE_PROJECT_NAME}_cryopaf ${psrdada_cpp_cryopaf_src})
cuda_add_executable(beamforming src/beamforming_cli.cu)
target_link_libraries(beamforming ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
install(TARGETS beamforming DESTINATION bin)
cuda_add_executable(weightupdater src/weightupdater_cli.cu)
target_link_libraries(weightupdater ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
install(TARGETS weightupdater DESTINATION bin)
add_subdirectory(profiling)
add_subdirectory(test)
endif(ENABLE_CUDA)
/*
* Pipeline.cuh
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file consists of a single class (Pipeline<HandlerType, ComputeType, ResultType>)
* and a configuration structure (PipelineConfig).
* An object of Pipeline is used to access data from psrdada buffers, unpacks them,
* performs beamforming and writes the results back to another psrdada buffer.
* TODO:
* - We need a packer
* - We need a monitoring interface
*/
#ifndef PIPELINE_CUH_
#define PIPELINE_CUH_
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cuda.h>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/raw_bytes.hpp"
#include "psrdada_cpp/cryopaf/Unpacker.cuh"
#include "psrdada_cpp/cryopaf/Beamformer.cuh"
#include "psrdada_cpp/cryopaf/BufferTypes.cuh"
namespace psrdada_cpp{
namespace cryopaf{
struct PipelineConfig{
key_t in_key;
key_t out_key;
int device_id;
std::string logname;
std::size_t n_samples;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_beam;
std::size_t integration;
std::string input_type;
std::string mode;
std::string protocol;
const std::size_t n_pol = 2;
void print()
{
std::cout << "Pipeline configuration" << std::endl;
std::cout << "in_key: " << in_key << std::endl;
std::cout << "out_key: " << out_key << std::endl;
std::cout << "device_id: " << device_id << std::endl;
std::cout << "logname: " << logname << std::endl;
std::cout << "n_samples: " << n_samples << std::endl;
std::cout << "n_channel: " << n_channel << std::endl;
std::cout << "input_type: " << input_type << std::endl;
std::cout << "n_elements: " << n_elements << std::endl;
std::cout << "n_pol: " << n_pol << std::endl;
std::cout << "n_beam: " << n_beam << std::endl;
std::cout << "integration: " << integration << std::endl;
std::cout << "mode: " << mode << std::endl;
}
};
template<class HandlerType, class ComputeType, class ResultType>
class Pipeline{
// Internal type definitions
private:
typedef RawVoltage<char> RawInputType; // Type for received raw input data (voltage)
typedef RawVoltage<ComputeType> InputType;// Type for unpacked raw input data (voltage)
typedef Weights<ComputeType> WeightType; // Type for beam weights
typedef BeamOutput<ResultType> OutputType;// Type for beamformed output data
public:
/**
* @brief Constructs an object of Pipeline
*
* @param PipelineConfig conf Pipeline configuration containing all necessary parameters (declared above)
* @param MultiLog log Logging instance
* @param HandlerType handler Object for handling output data
*
* @detail Initializes the pipeline environment including device memory and processor objects.
*/
Pipeline(PipelineConfig& conf, MultiLog &log, HandlerType &handler);
/**
* @brief Destroys an object of Pipeline
*
* @detail Destroys all objects and allocated memory
*/
~Pipeline();
/**
* @brief Initialise the pipeline with a DADA header block
*
* @param header_block A RawBytes object wrapping the DADA header block
*/
void init(RawBytes &header_block);
/**
* @brief Process the data in a DADA data buffer
*
* @param dada_block A RawBytes object wrapping the DADA data block
*/
bool operator()(RawBytes &dada_block);
// Internal attributes
private:
HandlerType &_handler;
MultiLog &_log;
PipelineConfig& _conf;
// Processors
Unpacker<ComputeType>* unpacker = nullptr; // Object to unpack and transpose received input data on GPU to an expected format
Beamformer<ComputeType>* beamformer = nullptr; // Object to perform beamforming on GPU
// Buffers
RawInputType *_raw_input_buffer = nullptr; // Received input buffer
InputType *_input_buffer = nullptr; // Unpacked and transposed input buffer
WeightType *_weight_buffer = nullptr; // Beam weights, updated through shared memory
OutputType *_output_buffer = nullptr; // Output buffer containing processed beams
std::size_t _call_cnt = 0; // Internal dada block counter
cudaStream_t _h2d_stream; // Host to device CUDA stream (used for async copies)
cudaStream_t _prc_stream; // Processing stream
cudaStream_t _d2h_stream; // Device to host CUDA stream (used for async copies)
#ifdef DEBUG
// Time measurement variables (debugging only)
cudaEvent_t start, stop;
float ms;
#endif
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/src/Pipeline.cu"
#endif /* PIPELINE_CUH_ */
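Since HandlerType only needs to mirror the init()/operator() pair declared above, a minimal output handler can be sketched as follows (editorial addition; NullHandler is hypothetical, and the return convention is assumed from the bool signature of Pipeline::operator()):

#include "psrdada_cpp/raw_bytes.hpp"

// Minimal handler satisfying the HandlerType interface assumed by Pipeline
struct NullHandler
{
    void init(psrdada_cpp::RawBytes& header) { /* inspect the DADA header */ }
    bool operator()(psrdada_cpp::RawBytes& block)
    {
        // consume one block of beamformed output;
        // return value assumed to signal whether processing should stop
        return false;
    }
};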
#ifndef PIPELINE_INTERFACE_HPP
#define PIPELINE_INTERFACE_HPP
#include <vector>
#include <string>
#include <unistd.h>
#include <random>
#include <cmath>
#include <complex>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <boost/thread.hpp>
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/cryopaf/QueueHeader.hpp"
namespace psrdada_cpp{
namespace cryopaf{
namespace bip = boost::interprocess;
struct PipelineInterfaceConfig{
std::string logname;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_pol;
std::size_t n_beam;
std::string mode;
void print()
{
std::cout << "Pipeline interface configuration" << std::endl;
std::cout << "logname: " << logname << std::endl;
std::cout << "n_channel: " << n_channel << std::endl;
std::cout << "n_elements: " << n_elements << std::endl;
std::cout << "n_pol: " << n_pol << std::endl;
std::cout << "n_beam: " << n_beam << std::endl;
std::cout << "mode: " << mode << std::endl;
}
};
template<class T>
class PipelineInterface
{
public:
PipelineInterface(PipelineInterfaceConfig& config, MultiLog& logger);
~PipelineInterface();
void run();
virtual void update() = 0;
private:
std::string smem_name = "SharedMemoryWeights";
bip::shared_memory_object smem;
bip::mapped_region region;
void* smem_addr = nullptr;
T* smem_weights = nullptr;
QueueHeader *qheader;
protected:
PipelineInterfaceConfig& conf;
MultiLog& log;
std::vector<T> vect_weights;
bool quit = false;
std::size_t update_cnt = 0;
};
template<class T>
class SimpleWeightGenerator : public PipelineInterface<T>
{
public:
SimpleWeightGenerator(PipelineInterfaceConfig& config, MultiLog& logger);
~SimpleWeightGenerator();
void update();
private:
void bypass();
void random();
};
} // namespace cryopaf
} // namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/src/PipelineInterface.cu"
#endif // end PIPELINE_INTERFACE_HPP
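A custom weight source can be derived analogously to SimpleWeightGenerator. The sketch below (editorial addition; the class is hypothetical) assumes that update() is expected to refill vect_weights, which the base class then publishes through shared memory:

// Trivial weight source: unit gain, zero phase on every entry
template<class T>
class ConstantWeightGenerator : public psrdada_cpp::cryopaf::PipelineInterface<T>
{
public:
    ConstantWeightGenerator(psrdada_cpp::cryopaf::PipelineInterfaceConfig& config,
                            psrdada_cpp::MultiLog& logger)
        : psrdada_cpp::cryopaf::PipelineInterface<T>(config, logger) {}
    void update() override
    {
        for(auto& w : this->vect_weights)
            w = T{1, 0};
    }
};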
/*
* QueueHeader.hpp
* Author: Niclas Esser <nesser@mpifr-bonn.mpg.de>
* Description:
* This file contains the structure which is used for shared memory IPC
*/
#ifndef QUEUE_HEADER_HPP_
#define QUEUE_HEADER_HPP_
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/interprocess_condition.hpp>
namespace bip = boost::interprocess;
/**
* @brief Header stored in POSIX shared memory to allow IPC communication
*/
struct QueueHeader
{
QueueHeader()
: data_in(false),
stop(true)
{}
// Mutex to protect access to the queue
bip::interprocess_mutex mutex;
// Condition to wait when the queue is empty
bip::interprocess_condition ready_to_read;
// Condition to wait when the queue is full
bip::interprocess_condition ready_to_write;
// Is there any payload?
bool data_in;
// Stop flag, will force the owner to remove the shared memory
bool stop;
};
#endif /* QUEUE_HEADER_HPP_ */
#ifndef UNPACKER_CUH
#define UNPACKER_CUH
#include <cuda.h>
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "psrdada_cpp/common.hpp"
#include "psrdada_cpp/multilog.hpp"
#define NCHAN_CHK 7
#define NSAMP_DF 128
#define NPOL_SAMP 2
#define NSAMP_PER_HEAP 1024
namespace psrdada_cpp {
namespace cryopaf {
template<typename T>__global__
void unpack_codif_to_fpte(uint64_t const* __restrict__ idata, T* __restrict__ odata);
template<typename U, typename T>__global__
void unpack_spead_ttfep_to_fpte(U const* __restrict__ idata, T* __restrict__ odata);
template<typename T>
class Unpacker
{
public:
Unpacker(cudaStream_t& stream,
std::size_t nsamples,
std::size_t nchannels,
std::size_t nelements,
std::string protocol);
~Unpacker();
Unpacker(Unpacker const&) = delete;
void unpack(char* input, T* output);
void print_layout();
int sample_size(){return _sample_size;}
private:
cudaStream_t& _stream;
int _sample_size = 8; // TODO: dynamic for spead and fixed for codif
std::string _protocol;
dim3 grid;
dim3 block;
};
} //namespace cryopaf
} //namespace psrdada_cpp
#include "psrdada_cpp/cryopaf/details/UnpackerKernels.cu"
#include "psrdada_cpp/cryopaf/src/Unpacker.cu"
#endif // UNPACKER_CUH
#ifdef UNPACKER_CUH
namespace psrdada_cpp {
namespace cryopaf{
// Byte-swaps a 64-bit word (big endian network payload to little endian)
// by reversing the byte order of each 32-bit half and exchanging the halves
__device__ __forceinline__ uint64_t swap64(uint64_t x)
{
uint64_t result;
uint2 t;
// Split the 64-bit word into two 32-bit registers
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(t.x), "=r"(t.y) : "l"(x));
// Reverse the byte order within each half
t.x = __byte_perm(t.x, 0, 0x0123);
t.y = __byte_perm(t.y, 0, 0x0123);
// Re-assemble with the halves exchanged
asm("mov.b64 %0,{%1,%2}; \n\t"
: "=l"(result) : "r"(t.y), "r"(t.x));
return result;
}
template<typename T>__global__
void unpack_codif_to_fpte(uint64_t const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x + blockIdx.x * blockDim.x; // Time
int elem = threadIdx.y + blockIdx.y * blockDim.y; // Elements
int freq = threadIdx.z + blockIdx.z * blockDim.z; // Frequency
int chan = blockDim.z * gridDim.z;
int time_in = blockIdx.x * blockDim.x * gridDim.y * chan + threadIdx.x * chan;
int freq_in = freq;
int elem_in = elem * NSAMP_DF * chan ;
int freq_out = freq * NPOL_SAMP * gridDim.x * blockDim.x * gridDim.y;
int time_out = time * gridDim.y;
int in_idx = time_in + freq_in + elem_in;
int out_idx_x = freq_out + time_out + elem;
int out_idx_y = freq_out + gridDim.x * blockDim.x * gridDim.y + time_out + elem;
uint64_t tmp = swap64(idata[in_idx]);
odata[out_idx_x].x = static_cast<decltype(T::x)>((tmp & 0x000000000000ffffLL));
odata[out_idx_x].y = static_cast<decltype(T::y)>((tmp & 0x00000000ffff0000LL) >> 16);
odata[out_idx_y].x = static_cast<decltype(T::x)>((tmp & 0x0000ffff00000000LL) >> 32);
odata[out_idx_y].y = static_cast<decltype(T::y)>((tmp & 0xffff000000000000LL) >> 48);
}
template<typename U, typename T>__global__
void unpack_spead_ttfep_to_fpte(U const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x; // Time
int elem = blockIdx.y; // Elements
int freq = blockIdx.z; // Frequency
int heap_idx = blockIdx.x;
int in_idx = heap_idx * NSAMP_PER_HEAP * gridDim.z * gridDim.y * NPOL_SAMP // Outer time axis
+ time * gridDim.z * gridDim.y * NPOL_SAMP // Inner time axis
+ freq * gridDim.y * NPOL_SAMP // Frequency axis
+ elem * NPOL_SAMP; // Element axis
int out_idx_x = freq * NPOL_SAMP * gridDim.x * NSAMP_PER_HEAP * gridDim.y // Frequency axis
+ (time + blockIdx.x * blockDim.x) * gridDim.y
+ elem;
int out_idx_y = freq * NPOL_SAMP * gridDim.x * NSAMP_PER_HEAP * gridDim.y // Frequency axis
+ gridDim.x * NSAMP_PER_HEAP * gridDim.y
+ (time + blockIdx.x * blockDim.x) * gridDim.y
+ elem;
odata[out_idx_x].x = static_cast<decltype(T::x)>(idata[in_idx].x);
odata[out_idx_x].y = static_cast<decltype(T::y)>(idata[in_idx].y);
odata[out_idx_y].x = static_cast<decltype(T::x)>(idata[in_idx + 1].x);
odata[out_idx_y].y = static_cast<decltype(T::y)>(idata[in_idx + 1].y);
}
// ########################################################
// NOTE: The kernel below is deprecated and no longer used
// ########################################################
/*
template<typename T>__global__
void unpack_codif_to_tfep(uint64_t const* __restrict__ idata, T* __restrict__ odata)
{
int time = threadIdx.x + blockIdx.x * blockDim.x; // Time
int elem = threadIdx.y + blockIdx.y * blockDim.y; // Elements
int freq = threadIdx.z + blockIdx.z * blockDim.z; // Frequency
int chan = blockDim.z * gridDim.z;
int time_in = blockIdx.x * blockDim.x * gridDim.y * chan + threadIdx.x * chan;
int freq_in = freq;
int elem_in = elem * NSAMP_DF * chan ;
int time_out = time * chan * gridDim.y * NPOL_SAMP;
int freq_out = freq * gridDim.y * NPOL_SAMP;
int elem_out = elem * NPOL_SAMP;
int in_idx = time_in + freq_in + elem_in;
int out_idx = time_out + freq_out + elem_out;
uint64_t tmp = swap64(idata[in_idx]);
odata[out_idx].x = static_cast<decltype(T::x)>((tmp & 0x000000000000ffffLL));
odata[out_idx].y = static_cast<decltype(T::y)>((tmp & 0x00000000ffff0000LL) >> 16);
odata[out_idx + 1].x = static_cast<decltype(T::x)>((tmp & 0x0000ffff00000000LL) >> 32);
odata[out_idx + 1].y = static_cast<decltype(T::y)>((tmp & 0xffff000000000000LL) >> 48);
}
*/
}
}
#endif
#ifndef UTILS_CU
#define UTILS_CU
/** UTILS **/
// Complex multiplication for __half2 values (.x = real part, .y = imaginary part)
__device__ __half2 __hCmul2(__half2 a, __half2 b)
{
const __half r = a.x * b.x - a.y * b.y;
const __half i = a.x * b.y + a.y * b.x;
__half2 val; val.x = r; val.y = i;
return val;
}
// Complex multiply-add: returns a * b + c
template<typename T>
__host__ __device__ T cmadd(T a, T b, T c)
{
T val;
val.x = a.x * b.x - a.y * b.y + c.x;
val.y = a.x * b.y + a.y * b.x + c.y;
return val;
}
// Complex addition
template<typename T>
__host__ __device__ T cadd(T a, T b)
{
T val;
val.x = a.x + b.x;
val.y = a.y + b.y;
return val;
}
// Complex subtraction
template<typename T>
__host__ __device__ T csub(T a, T b)
{
T val;
val.x = a.x - b.x;
val.y = a.y - b.y;
return val;
}
// Complex magnitude |a|
template<typename T>
__host__ __device__ double cabs(T a)
{
return (double)sqrt((double)(a.x * a.x + a.y * a.y));
}
#endif
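Taken together, these helpers describe the core arithmetic of the beamformer: per (channel, sample, beam) the kernels reduce to a complex multiply-accumulate over elements. A scalar reference using the helpers above (editorial sketch; ignores the conjugation convention of the weights):

// Reference for one beam sample: acc = sum over e of in[e] * w[e]
template<typename T>
__host__ __device__ T beam_sample_reference(const T* in, const T* w, int nelem)
{
    T acc{0, 0};
    for(int e = 0; e < nelem; ++e)
        acc = cmadd(in[e], w[e], acc); // acc += in[e] * w[e]
    return acc;
}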
set(profiling_src
KernelStatistics.cuh
profiling.cu
)
cuda_add_executable(profiling ${profiling_src} )
target_link_libraries(profiling ${PSRDADA_CPP_CRYOPAF_LIBRARIES})
#ifndef KERNEL_STATISTICS_CUH
#define KERNEL_STATISTICS_CUH
#include <fstream>
#include <cuda.h>
#include "psrdada_cpp/cuda_utils.hpp"
struct ProfileConfig{
int device_id;
std::size_t n_samples;
std::size_t n_channel;
std::size_t n_elements;
std::size_t n_beam;
std::size_t integration;
std::string precision;
std::string protocol;
std::string out_dir;
const std::size_t n_pol = 2;
};
class KernelProfiler
{
public:
KernelProfiler(ProfileConfig& config,
cudaStream_t& cuda_stream,
std::string kernel_name,
std::size_t complexity,
std::size_t read_size,
std::size_t write_size,
std::size_t input_size);
~KernelProfiler();
void measure_start();
void measure_stop();
void update(float time_ms);
void finialize();
void export_to_csv(std::string filename);
private:
ProfileConfig& conf;
cudaStream_t& stream;
cudaDeviceProp prop;
std::string name;
std::string head_line;
cudaEvent_t start, stop;
std::vector<float> elapsed_time;
std::vector<float> compute_tput;
std::vector<float> memory_bw;
std::vector<float> input_bw;
std::vector<float> output_bw;
float peak_mem_bandwidth;
float ms = 0;
float avg_time = 0;
float min_time = 0;
float max_time = 0;
float avg_tput = 0;
float min_tput = 0;
float max_tput = 0;
float avg_m_bw = 0;
float min_m_bw = 0;
float max_m_bw = 0;
float avg_m_bw_perc = 0;
float min_m_bw_perc = 0;
float max_m_bw_perc = 0;
float avg_i_bw = 0;
float min_i_bw = 0;
float max_i_bw = 0;
float avg_o_bw = 0;
float min_o_bw = 0;
float max_o_bw = 0;
std::size_t iterations = 0;
std::size_t reads;
std::size_t writes;
std::size_t input_sz;
std::size_t compute_complexity;
};
#include "psrdada_cpp/cryopaf/profiling/src/KernelStatistics.cu"
#endif
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/complex.h>
#include <cuda.h>
#include <random>
#include <cmath>
#include <fstream>
#include <chrono>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include "psrdada_cpp/cuda_utils.hpp"
#include "psrdada_cpp/multilog.hpp"
#include "psrdada_cpp/cryopaf/Unpacker.cuh"
#include "psrdada_cpp/cryopaf/Beamformer.cuh"
#include "psrdada_cpp/cryopaf/profiling/KernelStatistics.cuh"
const size_t ERROR_IN_COMMAND_LINE = 1;
const size_t SUCCESS = 0;
using namespace psrdada_cpp;
using namespace psrdada_cpp::cryopaf;
using namespace std::chrono;
template<typename T>
void profile(ProfileConfig conf, std::size_t iter)
{
// If the template parameter is not a complex type, profiling has to be aborted
if( !(std::is_same<T, float2>::value)
&& !(std::is_same<T, __half2>::value))
{
BOOST_LOG_TRIVIAL(error) << "ProfilingError: Template type not supported";
exit(1);
}
cudaStream_t stream;
CUDA_ERROR_CHECK(cudaStreamCreate(&stream));
// Instantiate processor objects
Beamformer<T> beamformer(stream,
conf.n_samples,
conf.n_channel,
conf.n_elements,
conf.n_beam,
conf.integration);
Unpacker<T> unpacker(stream,
conf.n_samples,
conf.n_channel,
conf.n_elements,
conf.protocol);
// Calculate memory sizes for input, weights and output
std::size_t input_size = conf.n_samples
* conf.n_elements
* conf.n_channel
* conf.n_pol;
std::size_t weight_size = conf.n_beam
* conf.n_elements
* conf.n_channel
* conf.n_pol;
std::size_t output_size = conf.n_samples
* conf.n_beam
* conf.n_channel;
std::size_t required_mem = input_size * sizeof(T)
+ weight_size * sizeof(T)
+ output_size * sizeof(decltype(T::x));
BOOST_LOG_TRIVIAL(debug) << "Required device memory: " << std::to_string(required_mem / (1024*1024)) << "MiB";
// Allocate device memory
thrust::device_vector<char> input_up(input_size * unpacker.sample_size(),0);
thrust::device_vector<T> input_bf(input_size, {.1, .1});
thrust::device_vector<T> weights_bf(weight_size, {.1, .1});
thrust::device_vector<T> output_bf_voltage(output_size * conf.n_pol);
thrust::device_vector<decltype(T::x)> output_bf_power(output_size / conf.integration, 0);
// Calculation of compute complexity
std::size_t n = conf.n_beam
* conf.n_channel
* conf.n_elements
* conf.n_pol
* conf.n_samples;
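// Assumed FLOP accounting (editorial note): one complex multiply-accumulate
// costs 8 real FLOPs (4 multiplications + 4 additions), hence 8 * n for the
// voltage beamformer; Stokes I detection adds roughly 4 * n for |y|^2 and
// its accumulation.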
std::size_t complexity_bf_vol = 8 * n;
std::size_t complexity_bf_pow = 8 * n + 4 * n; // + detection and accumulation
std::size_t complexity_unpack = n / conf.n_beam;
// Create KernelStatistics object for each kernel
KernelProfiler voltage_profiler(conf, stream,
"Voltage",
complexity_bf_vol,
(input_bf.size() + weights_bf.size()) * sizeof(T),
output_bf_voltage.size() * sizeof(T),
input_bf.size() * sizeof(T));
KernelProfiler power_profiler(conf, stream,
"StokesI",
complexity_bf_pow,
(input_bf.size() + weights_bf.size()) * sizeof(T),
output_bf_power.size() * sizeof(decltype(T::x)),
input_bf.size() * sizeof(T));
KernelProfiler unpack_profiler(conf, stream,
"Unpacker",
complexity_unpack,
input_up.size() * sizeof(uint64_t),
input_bf.size() * sizeof(T),
input_up.size() * sizeof(uint64_t));
// Run all used kernels i-times
for(int i = 0; i < iter; i++)
{
// Call Stokes I detection beamformer kernel
power_profiler.measure_start();
beamformer.process(
thrust::raw_pointer_cast(input_bf.data()),
thrust::raw_pointer_cast(weights_bf.data()),
thrust::raw_pointer_cast(output_bf_power.data()));
power_profiler.measure_stop();
// Call to voltage beamformer kernel
voltage_profiler.measure_start();
beamformer.process(
thrust::raw_pointer_cast(input_bf.data()),
thrust::raw_pointer_cast(weights_bf.data()),
thrust::raw_pointer_cast(output_bf_voltage.data()));
voltage_profiler.measure_stop();
// Call to unpacking kernel
unpack_profiler.measure_start();
unpacker.unpack(
thrust::raw_pointer_cast(input_up.data()),
thrust::raw_pointer_cast(input_bf.data()));
unpack_profiler.measure_stop();
}
CUDA_ERROR_CHECK(cudaStreamDestroy(stream));
power_profiler.finialize();
voltage_profiler.finialize();
unpack_profiler.finialize();
power_profiler.export_to_csv(conf.out_dir + "power_kernel.csv");
voltage_profiler.export_to_csv(conf.out_dir + "voltage_kernel.csv");
unpack_profiler.export_to_csv(conf.out_dir + "unpacker_kernel.csv");
}
int main(int argc, char** argv)
{
// Variables to store command line options
ProfileConfig conf;
int iter;
// Parse command line
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help,h", "Print help messages")
("samples", po::value<std::size_t>(&conf.n_samples)->default_value(1024), "Number of samples within one batch")
("channels", po::value<std::size_t>(&conf.n_channel)->default_value(14), "Number of channels")
("elements", po::value<std::size_t>(&conf.n_elements)->default_value(WARP_SIZE*4), "Number of elements")
("beams", po::value<std::size_t>(&conf.n_beam)->default_value(64), "Number of beams")
("integration", po::value<std::size_t>(&conf.integration)->default_value(1), "Integration interval; must be multiple 2^n and smaller 32")
("device", po::value<int>(&conf.device_id)->default_value(0), "ID of GPU device")
("protocol", po::value<std::string>(&conf.protocol)->default_value("codif"), "Protocol of input data; supported protocol 'codif'")
("iteration", po::value<int>(&iter)->default_value(5), "Iterations to run")
("precision", po::value<std::string>(&conf.precision)->default_value("half"), "Compute type of GEMM operation; supported precisions 'half' and 'single'")
("outdir", po::value<std::string>(&conf.out_dir)->default_value("Results/"), "Output directory to store csv files");
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc), vm);
if ( vm.count("help") )
{
std::cout << "Beamform Profiling" << std::endl
<< desc << std::endl;
return SUCCESS;
}
po::notify(vm);
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return ERROR_IN_COMMAND_LINE;
}
if(conf.precision == "half")
{
profile<__half2>(conf, iter);
}
else if(conf.precision == "single")
{
profile<float2>(conf, iter);
}
else
{
BOOST_LOG_TRIVIAL(error) << "Compute type " << precision << " not implemented";
}
return 0;
}
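For reference, a typical invocation of the resulting binary (target `profiling` from the CMakeLists above; all flags as defined in desc) might be:

profiling --samples 1024 --channels 14 --elements 128 --beams 64 --precision half --iteration 5 --outdir Results/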