refactor to improve performance

a346222c · Jason Wu · 42c0808e · a346222c · a346222c
Commit a346222c authored 2 years ago by Jason Wu
--- a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
@@ -9,28 +9,32 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
-uint64_t interleave(uint32_t x, uint32_t y) {
-    __m128i xvec = _mm_cvtsi32_si128(x);
-    __m128i yvec = _mm_cvtsi32_si128(y);
-    __m128i interleaved = _mm_unpacklo_epi8(yvec, xvec);
-    return _mm_cvtsi128_si64(interleaved);
-}
 void merge2pol(char const *buf, char *out)
 {
+    // Byte-interleaves the two HEAP_SIZE-byte halves of buf into out:
+    //   out = h0[0], h1[0], h0[1], h1[1], ...   (2 * HEAP_SIZE bytes)
+    // where h0 starts at buf and h1 at buf + HEAP_SIZE. This is the byte order
+    // the removed scalar interleave() loop produced.
+    // NOTE(review): presumably each half holds one polarisation -- confirm
+    // against the caller.
-    uint8_t *qword0 = (uint8_t*)(buf);
+    uint8_t const* qword0 = reinterpret_cast<uint8_t const*>(buf);
-    uint8_t *qword1 = (uint8_t*)(buf) + HEAP_SIZE;
+    uint8_t const* qword1 = reinterpret_cast<uint8_t const*>(buf) + HEAP_SIZE;
    uint64_t* D = reinterpret_cast<uint64_t*>(out);
-    for (size_t i = 0; i < HEAP_SIZE / sizeof(uint32_t); i++)
+    const size_t heap_bytes = HEAP_SIZE;
+    size_t done = 0;
+    // Vector part: 16 bytes from each half in, 32 interleaved bytes out.
+    for (; done + 16 <= heap_bytes; done += 16)
    {
-        uint32_t* S0 = reinterpret_cast<uint32_t*>(qword0);
-        uint32_t* S1 = reinterpret_cast<uint32_t*>(qword1);
-        *D++ = interleave(*S1++, *S0++);
-        qword0 += sizeof(uint32_t);
-        qword1 += sizeof(uint32_t);
+        const __m128i xvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword0));
+        const __m128i yvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword1));
+        // unpacklo/unpackhi each consume only 8 bytes of each operand, so both
+        // are required to cover the 16 bytes loaded. xvec is the first operand
+        // so that the byte from the first half leads each output pair.
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(D), _mm_unpacklo_epi8(xvec, yvec));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(D + 2), _mm_unpackhi_epi8(xvec, yvec));
+        qword0 += 16;
+        qword1 += 16;
+        D += 4;
    }
+    // Scalar tail: covers a HEAP_SIZE that is not a multiple of 16 without
+    // reading or writing past the end of either buffer.
+    uint8_t* tail = reinterpret_cast<uint8_t*>(D);
+    for (; done < heap_bytes; ++done)
+    {
+        *tail++ = *qword0++;
+        *tail++ = *qword1++;
+    }
 }
 EDDPolnMerge::EDDPolnMerge(std::size_t npol, int nthreads, DadaWriteClient& writer)
    : _npol(npol)
    , _nthreads(nthreads)

--- a/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp
@@ -9,13 +9,6 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
-uint64_t interleave(uint32_t x, uint32_t y) {
-	__m128i xvec = _mm_cvtsi32_si128(x);
-	__m128i yvec = _mm_cvtsi32_si128(y);
-	__m128i interleaved = _mm_unpacklo_epi8(yvec, xvec);
-	return _mm_cvtsi128_si64(interleaved);
-}
 EDDRoach_merge_leap::EDDRoach_merge_leap(std::size_t nchunck, int nthreads, int heap_size, DadaWriteClient& writer)
 	: _nchunck(nchunck)
 	, _heap_size(heap_size)
@@ -55,29 +48,30 @@ void EDDRoach_merge_leap::init(RawBytes& block)
 	oblock.used_bytes(oblock.total_bytes());
 	_writer.header_stream().release();
 }
 bool EDDRoach_merge_leap::operator()(RawBytes & block)
 {
+	// Re-orders one input block into the next output block: within every group
+	// of heap_group bytes, the _nchunck consecutive sub-blocks are interleaved
+	// in units of bytes_per_chunk bytes. The visible tail releases the output
+	// block and returns false.
 	BOOST_LOG_TRIVIAL(info) << "nchucnk " << _nchunck << "\n";
 	RawBytes& oblock = _writer.data_stream().next();
-	std::size_t bytes_per_chunk = 4;
+	const std::size_t bytes_per_chunk = 4;
-	std::size_t heap_group = _heap_size * _nchunck;
+	const std::size_t heap_group = _heap_size * _nchunck;
+	// Number of complete heap groups in the input; hoisted out of the loop condition.
+	const std::size_t num_chunks = block.used_bytes() / heap_group;
 	#pragma omp parallel for num_threads(_nthreads)
-	for (std::size_t xx = 0; xx < block.used_bytes() / heap_group; xx++)
+	for (std::size_t xx = 0; xx < num_chunks; ++xx)
 	{
-		std::vector<char*> ptrs(_nchunck);
+		// One read cursor per sub-block of the current heap group.
+		std::vector<const char*> chunk_ptrs(_nchunck);
 		for (std::size_t ii = 0; ii < _nchunck; ++ii)
 		{
-			ptrs[ii] = block.ptr() + xx * heap_group + ii * heap_group / _nchunck;
+			const std::size_t offset = xx * heap_group + ii * heap_group / _nchunck;
+			chunk_ptrs[ii] = block.ptr() + offset;
 		}
-		const char *target = oblock.ptr() + xx * heap_group;
+		// Write cursor into the output block. It is the memcpy destination, so
+		// it is a pointer to non-const and no cast is needed.
+		char *target = oblock.ptr() + xx * heap_group;
-		for (std::size_t yy = 0; yy < heap_group / _nchunck / bytes_per_chunk; yy++)
+		for (std::size_t yy = 0; yy < heap_group / _nchunck / bytes_per_chunk; ++yy)
 		{
 			for (std::size_t ii = 0; ii < _nchunck; ++ii)
 			{
-				std::memcpy((void*)target, (void*)ptrs[ii], bytes_per_chunk);
+				std::memcpy(target, chunk_ptrs[ii], bytes_per_chunk);
-				ptrs[ii] += bytes_per_chunk;
+				chunk_ptrs[ii] += bytes_per_chunk;
 				target += bytes_per_chunk;
 			}
 		}
@@ -86,6 +80,7 @@ bool EDDRoach_merge_leap::operator()(RawBytes& block)
 	_writer.data_stream().release();
 	return false;
 }
 }//edd
 }//effelsberg
 }//psrdada_cpp