multithread pol merge and 8 bit unpacker update

6ced6c36 · Jason Wu · 8617011d · 6ced6c36 · 6ced6c36 · 6ced6c36
Commit 6ced6c36 authored Jan 26, 2021 by Jason Wu
--- a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
@@ -16,9 +16,25 @@ namespace edd {
 	  return _mm_cvtsi128_si64(interleaved);
 	}

-    EDDPolnMerge::EDDPolnMerge(std::size_t nsamps_per_heap, std::size_t npol, DadaWriteClient& writer)
+void merge2pol(char const *buf, char *out)
+  {
+  uint8_t *qword0 = (uint8_t*)(buf);
+  uint8_t *qword1 = (uint8_t*)(buf)+4096;
+  uint64_t* D = reinterpret_cast<uint64_t*>(out);
+  for(int i=0;i<4096/4;i++)
+    {
+    	uint32_t* S0 = reinterpret_cast<uint32_t*>(qword0);
+    	uint32_t* S1 = reinterpret_cast<uint32_t*>(qword1);
+        *D++ = interleave(*S1++, *S0++);
+	qword0 +=4;
+	qword1 +=4;
+   }
+  }
+
+    EDDPolnMerge::EDDPolnMerge(std::size_t nsamps_per_heap, std::size_t npol, std::size_t nthreads, DadaWriteClient& writer)
    : _nsamps_per_heap(nsamps_per_heap)
    , _npol(npol)
+    ,_nthreads(nthreads)
    , _writer(writer)
    {
    }
@@ -43,10 +59,10 @@ namespace edd {
 	long double sample_clock = std::strtold(buffer, NULL);
 	ascii_header_get(block.ptr(), "SYNC_TIME", "%s", buffer);
 	long double sync_time = std::strtold(buffer, NULL);
-	    BOOST_LOG_TRIVIAL(debug) << "Sample_clock_start "<< sample_clock_start;
-	    BOOST_LOG_TRIVIAL(debug)<<  "Sample_clock "<< sample_clock;
-	    BOOST_LOG_TRIVIAL(debug) << "Sync_time "<< sync_time;
-		BOOST_LOG_TRIVIAL(debug) << "Sample_clock_start / sample_clock "<< sample_clock_start / sample_clock;
+        BOOST_LOG_TRIVIAL(debug) << "this is sample_clock_start "<< sample_clock_start;
+        BOOST_LOG_TRIVIAL(debug)<< "this is sample_clock "<< sample_clock;
+        BOOST_LOG_TRIVIAL(debug) << "this is sync_time "<< sync_time;
+	BOOST_LOG_TRIVIAL(debug) << "this is sample_clock_start / sample_clock "<< sample_clock_start / sample_clock;
 	long double unix_time = sync_time + (sample_clock_start / sample_clock);
 	long double mjd_time = unix_time / 86400 - 40587.5;
 	char time_buffer[80];
@@ -60,7 +76,11 @@ namespace edd {
 	std::stringstream utc_time_stamp;
 	BOOST_LOG_TRIVIAL(debug) << "unix_time" << unix_time;
 	BOOST_LOG_TRIVIAL(debug) << "fractional part " << fractpart;
+        //BOOST_LOG_TRIVIAL(debug) << "fractional part ." << static_cast<std::size_t>(fractpart*10000000000);
+        //utc_time_stamp<< time_buffer << "." <<fractpart;
 	utc_time_stamp<< time_buffer << "." << std::setw(10) << std::setfill('0') << std::size_t(fractpart*10000000000) << std::setfill(' ');
+	//BOOST_LOG_TRIVIAL(debug) << "fractional part" <<static_cast<std::size_t>(fractpart * 10000000000);
+	//utc_time_stamp<< time_buffer << "." << static_cast<std::size_t>(fractpart * 10000000000);
 	BOOST_LOG_TRIVIAL(debug) << "this is start time in utc "<< utc_time_stamp.str().c_str()<< "\n";
 //	std::cout << "this is sync_time MJD "<< mjd_time<< "\n";
 	ascii_header_set(oblock.ptr(), "UTC_START", "%s", utc_time_stamp.str().c_str());
@@ -72,6 +92,15 @@ namespace edd {
    bool EDDPolnMerge::operator()(RawBytes& block)
    {
 	std:size_t nheap_groups = block.used_bytes()/_npol/_nsamps_per_heap;
+/**
+        if (block.used_bytes() < block.total_bytes())
+        {
+            BOOST_LOG_TRIVIAL (debug) << "Reach end of data";
+            _writer.data_stream().next();
+            _writer.data_stream().release();
+            return true;
+        }
+**/
 	RawBytes& oblock = _writer.data_stream().next();

        if (block.used_bytes() > oblock.total_bytes())
@@ -79,20 +108,13 @@ namespace edd {
 	    _writer.data_stream().release();
 	    throw std::runtime_error("Output DADA buffer does not match with the input dada buffer");
        }
-		uint32_t* S0 = reinterpret_cast<uint32_t*>(block.ptr());
-        uint32_t* S1 = reinterpret_cast<uint32_t*>(block.ptr() + _nsamps_per_heap);
-		uint64_t* D = reinterpret_cast<uint64_t*>(oblock.ptr());

-	for (std::size_t jj = 0; jj < nheap_groups; ++jj)
-		{
-		for (std::size_t ii = 0; ii < _nsamps_per_heap/sizeof(uint32_t); ++ii)
+	#pragma omp parallel for schedule(dynamic, _nthreads) num_threads(_nthreads)
+    	for (std::size_t kk = 0; kk < block.used_bytes()/_nsamps_per_heap/_npol ; ++kk)
        {
-			*D++ = interleave(*S1++, *S0++);
-			}
-			S0 += _nsamps_per_heap/sizeof(uint32_t);
-			S1 += _nsamps_per_heap/sizeof(uint32_t);
+        char *buffer = block.ptr() + _nsamps_per_heap * _npol * kk;
+        merge2pol(buffer, oblock.ptr() + kk * _npol * _nsamps_per_heap);
        }
-
 	oblock.used_bytes(block.used_bytes());
 	_writer.data_stream().release();
        return false;


--- a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge10to8.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge10to8.cpp
@@ -16,75 +16,91 @@ namespace edd {
      return _mm_cvtsi128_si64(interleaved);
    }
    
-void handle_packet_numbers_4096x10_s(char const *buf, char *out)
-  { // Print 4096 numbers of 10 bit signed integers.
-  int i;
-  int16_t  adc;
-  uint64_t val,rest;
-  uint64_t *qword;
-  qword = (uint64_t*)(buf);
-  for(i=0;i<640/5;i++)
+uint64_t *unpack5(uint64_t *qword, uint8_t *out)
 {
-    // 1st:
+    uint64_t val,rest;
    val = be64toh(*qword);
    //printf("0x%016lX\n",val);
    qword++;
-    out[i] = ((int64_t)(( 0xFFC0000000000000 & val) <<  0) >> 54) & 0x3FC; 
-    out[i+1] = ((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54) & 0x3FC; 
-    out[i+2] = ((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54) & 0x3FC; 
-    out[i+3] = ((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54) & 0x3FC; 
-    out[i+4] = ((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54) & 0x3FC; 
-    out[i+5] = ((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54) & 0x3FC; 
+    out[0] = ((int64_t)(( 0xFFC0000000000000 & val) <<  0) >> 54) & 0xFF;
+    out[1] = ((int64_t)(( 0x003FF00000000000 & val) << 10) >> 54) & 0xFF;
+    out[2] = ((int64_t)(( 0x00000FFC00000000 & val) << 20) >> 54) & 0xFF;
+    out[3] = ((int64_t)(( 0x00000003FF000000 & val) << 30) >> 54) & 0xFF;
+    out[4] = ((int64_t)(( 0x0000000000FFC000 & val) << 40) >> 54) & 0xFF;
+    out[5] = ((int64_t)(( 0x0000000000003FF0 & val) << 50) >> 54) & 0xFF;
    rest =          ( 0x000000000000000F & val) << 60; // 4 bits rest.
    // 2nd:
    val = be64toh(*qword);
    //printf("0x%016lX\n",val);
    qword++;
-    out[i+6] = ((int64_t)(((0xFC00000000000000 & val) >> 4) | rest) >> 54) & 0x3FC; 
-    out[i+7] = ((int64_t)(( 0x03FF000000000000 & val) <<  6) >> 54) & 0x3FC; 
-    out[i+8] = ((int64_t)(( 0x0000FFC000000000 & val) << 16) >> 54) & 0x3FC; 
-    out[i+9] = ((int64_t)(( 0x0000003FF0000000 & val) << 26) >> 54) & 0x3FC; 
-    out[i+10] =((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54) & 0x3FC; 
-    out[i+11] = ((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54) & 0x3FC; 
+    out[6] = ((int64_t)(((0xFC00000000000000 & val) >> 4) | rest) >> 54) & 0xFF;
+    out[7] = ((int64_t)(( 0x03FF000000000000 & val) <<  6) >> 54) & 0xFF;
+    out[8] = ((int64_t)(( 0x0000FFC000000000 & val) << 16) >> 54) & 0xFF;
+    out[9] = ((int64_t)(( 0x0000003FF0000000 & val) << 26) >> 54) & 0xFF;
+    out[10] =((int64_t)(( 0x000000000FFC0000 & val) << 36) >> 54) & 0xFF;
+    out[11] = ((int64_t)(( 0x000000000003FF00 & val) << 46) >> 54) & 0xFF;
    rest =          ( 0x00000000000000FF & val) << 56; // 8 bits rest.
    // 3rd:
    val = be64toh(*qword);
    //printf("0x%016lX\n",val);
    qword++;
-    out[i+12] = ((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54) & 0x3FC; 
-    out[i+13] = ((int64_t)(( 0x3FF0000000000000 & val) <<  2) >> 54) & 0x3FC; 
-    out[i+14] = ((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54) & 0x3FC; 
-    out[i+15] = ((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54) & 0x3FC; 
-    out[i+16] = ((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54) & 0x3FC; 
-    out[i+17] = ((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54) & 0x3FC; 
-    out[i+18] = ((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54) & 0x3FC; 
+    out[12] = ((int64_t)(((0xC000000000000000 & val) >> 8) | rest) >> 54) & 0xFF;
+    out[13] = ((int64_t)(( 0x3FF0000000000000 & val) <<  2) >> 54) & 0xFF;
+    out[14] = ((int64_t)(( 0x000FFC0000000000 & val) << 12) >> 54) & 0xFF;
+    out[15] = ((int64_t)(( 0x000003FF00000000 & val) << 22) >> 54) & 0xFF;
+    out[16] = ((int64_t)(( 0x00000000FFC00000 & val) << 32) >> 54) & 0xFF;
+    out[17] = ((int64_t)(( 0x00000000003FF000 & val) << 42) >> 54) & 0xFF;
+    out[18] = ((int64_t)(( 0x0000000000000FFC & val) << 52) >> 54) & 0xFF;
    rest =          ( 0x0000000000000003 & val) << 62; // 2 bits rest.
    // 4th:
    val = be64toh(*qword);
    //printf("0x%016lX\n",val);
    qword++;
-    out[i+19] = ((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54) & 0x3FC; 
-    out[i+20] = ((int64_t)(( 0x00FFC00000000000 & val) <<  8) >> 54) & 0x3FC; 
-    out[i+21] = ((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54) & 0x3FC; 
-    out[i+22] = ((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54) & 0x3FC; 
-    out[i+23] = ((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54) & 0x3FC; 
-    out[i+24] = ((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54) & 0x3FC; 
+    out[19] = ((int64_t)(((0xFF00000000000000 & val) >> 2) | rest) >> 54) & 0xFF;
+    out[20] = ((int64_t)(( 0x00FFC00000000000 & val) <<  8) >> 54) & 0xFF;
+    out[21] = ((int64_t)(( 0x00003FF000000000 & val) << 18) >> 54) & 0xFF;
+    out[22] = ((int64_t)(( 0x0000000FFC000000 & val) << 28) >> 54) & 0xFF;
+    out[23] = ((int64_t)(( 0x0000000003FF0000 & val) << 38) >> 54) & 0xFF;
+    out[24] = ((int64_t)(( 0x000000000000FFC0 & val) << 48) >> 54) & 0xFF;
    rest =          ( 0x000000000000003F & val) << 58; // 6 bits rest.
    // 5th:
    val = be64toh(*qword);
    //printf("0x%016lX\n",val);
    qword++;
-    out[i+25] = ((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54) & 0x3FC; 
-    out[i+26] = ((int64_t)(( 0x0FFC000000000000 & val) <<  4) >> 54) & 0x3FC; 
-    out[i+27] = ((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54) & 0x3FC; 
-    out[i+28] = ((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54) & 0x3FC; 
-    out[i+29] = ((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54) & 0x3FC; 
-    out[i+30] = ((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54) & 0x3FC; 
-    out[i+31] = ((int64_t)(( 0x00000000000003FF & val) << 54) >> 54) & 0x3FC; 
+    out[25] = ((int64_t)(((0xF000000000000000 & val) >> 6) | rest) >> 54) & 0xFF;
+    out[26] = ((int64_t)(( 0x0FFC000000000000 & val) <<  4) >> 54) & 0xFF;
+    out[27] = ((int64_t)(( 0x0003FF0000000000 & val) << 14) >> 54) & 0xFF;
+    out[28] = ((int64_t)(( 0x000000FFC0000000 & val) << 24) >> 54) & 0xFF;
+    out[29] = ((int64_t)(( 0x000000003FF00000 & val) << 34) >> 54) & 0xFF;
+    out[30] = ((int64_t)(( 0x00000000000FFC00 & val) << 44) >> 54) & 0xFF;
+    out[31] = ((int64_t)(( 0x00000000000003FF & val) << 54) >> 54) & 0xFF;
    rest = 0; // No rest.
+    return qword;
 }	
+
+void handle_packet_numbers_4096x10_s(char const *buf, char *out)
+  { // Print 4096 numbers of 10 bit signed integers.
+
+  uint64_t val,rest;
+  uint8_t S0_8bit[32];
+  uint8_t S1_8bit[32];
+  uint64_t *qword0 = (uint64_t*)(buf);
+  uint64_t *qword1 = (uint64_t*)(buf)+640;	
+  uint64_t* D = reinterpret_cast<uint64_t*>(out);
+  for(int i=0;i<640/5;i++)
+    {
+	qword0 =unpack5(qword0, S0_8bit);
+	qword1 =unpack5(qword1, S1_8bit);
+    
+    	uint32_t* S0 = reinterpret_cast<uint32_t*>(S0_8bit);
+    	uint32_t* S1 = reinterpret_cast<uint32_t*>(S1_8bit);
+        for (std::size_t ii = 0; ii < 8; ++ii)
+            {
+            *D++ = interleave(*S1++, *S0++);
            }
   
+   }
+  }
   EDDPolnMerge10to8::EDDPolnMerge10to8(std::size_t nsamps_per_heap, std::size_t npol, DadaWriteClient& writer)
    : _nsamps_per_heap(nsamps_per_heap)
    , _npol(npol)
@@ -131,6 +147,7 @@ void handle_packet_numbers_4096x10_s(char const *buf, char *out)
        BOOST_LOG_TRIVIAL(debug) << "fractional part " << fractpart;
        utc_time_stamp<< time_buffer << "." << std::setw(10) << std::setfill('0') << std::size_t(fractpart*10000000000) << std::setfill(' ');
        BOOST_LOG_TRIVIAL(debug) << "this is start time in utc "<< utc_time_stamp.str().c_str()<< "\n";
+        //  std::cout << "this is sync_time MJD "<< mjd_time<< "\n";
        ascii_header_set(oblock.ptr(), "UTC_START", "%s", utc_time_stamp.str().c_str());
        ascii_header_set(oblock.ptr(), "UNIX_TIME", "%Lf", unix_time);
        oblock.used_bytes(oblock.total_bytes());
@@ -142,23 +159,26 @@ void handle_packet_numbers_4096x10_s(char const *buf, char *out)
    {
 	std::cout << "Beginning of the operator"<< std::endl;
 	std:size_t nheap_groups = 0.8*block.used_bytes()/_npol/_nsamps_per_heap;
-	
        RawBytes& oblock = _writer.data_stream().next();

+//    if (block.used_bytes() > oblock.total_bytes())
+//        {
+//        _writer.data_stream().release();
+//        throw std::runtime_error("Output DADA buffer does not match with the input dada buffer");
+//         }
+
+        /* convert 10 bit to 8 bit data here */
    BOOST_LOG_TRIVIAL(debug) << "block.used_bytes() = "<<block.used_bytes();
    BOOST_LOG_TRIVIAL(debug) << "Entering unpack loop";
    
    #pragma omp parallel for schedule(dynamic, 4) num_threads(4)
    for (std::size_t kk = 0; kk < block.used_bytes()/ 5120/2 ; ++kk)
        {
-		
-		char *buffer_pol0 = block.ptr() + 5120 * kk * 2;
-        handle_packet_numbers_4096x10_s(buffer_pol0, oblock.ptr() + kk* 2 * 4096);
-		char *buffer_pol1 = block.ptr() + (5120 * kk * 2)+ 5120;
-        handle_packet_numbers_4096x10_s(buffer_pol1, oblock.ptr() + kk* 2 * 4096 + 4096);
-
+        char *buffer = block.ptr() + 5120 * 2 * kk;
+        handle_packet_numbers_4096x10_s(buffer, oblock.ptr() + kk * 8192);
        }
    oblock.used_bytes(block.used_bytes()*0.8);
+    //oblock.used_bytes(block.used_bytes());
    _writer.data_stream().release();
        return false;
    }


--- a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge_cli.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge_cli.cpp
@@ -25,7 +25,7 @@ int main(int argc, char** argv)
    	key_t output_key;
    	std::size_t npol;
    	std::size_t nsamps_per_heap;
-
+        std::size_t nthreads;
        /** Define and parse the program options
        */
        namespace po = boost::program_options;
@@ -52,6 +52,9 @@ int main(int argc, char** argv)
        ("npol,p", po::value<std::size_t>(&npol)->default_value(2),
            "Value of number of pol")

+        ("nthreads,n", po::value<std::size_t>(&nthreads)->default_value(2),
+            "Value of number of threads")
+
        ("nsamps_per_heap,n", po::value<std::size_t>(&nsamps_per_heap)->default_value(4096),
            "Value of samples per heap")

@@ -86,7 +89,7 @@ int main(int argc, char** argv)
         */
        MultiLog log("edd::EDDPolnMerge");
 	    DadaWriteClient output(output_key, log);
-	effelsberg::edd::EDDPolnMerge merger(nsamps_per_heap, npol, output);
+	    effelsberg::edd::EDDPolnMerge merger(nsamps_per_heap, npol, nthreads, output);
        DadaInputStream <decltype(merger)> input(input_key, log, merger);
        input.start();
        /**