diff --git a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
index 8f2a161ee842219ab8b648a068466c22e0e0033f..5a4ffcabc3653481376a81d4a0ce9e3d4d44b863 100644
--- a/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDPolnMerge.cpp
@@ -9,28 +9,32 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
 
-uint64_t interleave(uint32_t x, uint32_t y) {
-    __m128i xvec = _mm_cvtsi32_si128(x);
-    __m128i yvec = _mm_cvtsi32_si128(y);
-    __m128i interleaved = _mm_unpacklo_epi8(yvec, xvec);
-    return _mm_cvtsi128_si64(interleaved);
-}
-
 void merge2pol(char const *buf, char *out)
 {
-    uint8_t *qword0 = (uint8_t*)(buf);
-    uint8_t *qword1 = (uint8_t*)(buf) + HEAP_SIZE;
+    uint8_t const* qword0 = reinterpret_cast<uint8_t const*>(buf);
+    uint8_t const* qword1 = reinterpret_cast<uint8_t const*>(buf) + HEAP_SIZE;
     uint64_t* D = reinterpret_cast<uint64_t*>(out);
-    for (size_t i = 0; i < HEAP_SIZE / sizeof(uint32_t); i++)
+
+    __m128i xvec, yvec, interleaved;
+    for (size_t i = 0; i < HEAP_SIZE / sizeof(uint32_t); i += 8)
     {
-        uint32_t* S0 = reinterpret_cast<uint32_t*>(qword0);
-        uint32_t* S1 = reinterpret_cast<uint32_t*>(qword1);
-        *D++ = interleave(*S1++, *S0++);
-        qword0 += sizeof(uint32_t);
-        qword1 += sizeof(uint32_t);
+        xvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword0));
+        yvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword1));
+        interleaved = _mm_unpacklo_epi8(yvec, xvec);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(D), interleaved);
+
+        xvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword0 + 16));
+        yvec = _mm_loadu_si128(reinterpret_cast<__m128i const*>(qword1 + 16));
+        interleaved = _mm_unpacklo_epi8(yvec, xvec);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(D + 2), interleaved);
+
+        qword0 += 32;
+        qword1 += 32;
+        D += 4;
     }
 }
 
+
 EDDPolnMerge::EDDPolnMerge(std::size_t npol, int nthreads, DadaWriteClient& writer)
     : _npol(npol)
     , _nthreads(nthreads)
diff --git a/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp b/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp
index bca35d941cd7510cc4de67a49c8e825f41c596f0..bb8db0b75731bdd97b24df30f4626938db98fecb 100644
--- a/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp
+++ b/psrdada_cpp/effelsberg/edd/src/EDDRoach_merge_leap.cpp
@@ -9,13 +9,6 @@ namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
 
-uint64_t interleave(uint32_t x, uint32_t y) {
-	__m128i xvec = _mm_cvtsi32_si128(x);
-	__m128i yvec = _mm_cvtsi32_si128(y);
-	__m128i interleaved = _mm_unpacklo_epi8(yvec, xvec);
-	return _mm_cvtsi128_si64(interleaved);
-}
-
 EDDRoach_merge_leap::EDDRoach_merge_leap(std::size_t nchunck, int nthreads, int heap_size, DadaWriteClient& writer)
 	: _nchunck(nchunck)
 	, _heap_size(heap_size)
@@ -28,7 +21,7 @@ EDDRoach_merge_leap::~EDDRoach_merge_leap()
 {
 }
 
-void EDDRoach_merge_leap::init(RawBytes& block)
+void EDDRoach_merge_leap::init(RawBytes & block)
 {
 	RawBytes& oblock = _writer.header_stream().next();
 	if (block.used_bytes() > oblock.total_bytes())
@@ -37,47 +30,48 @@ void EDDRoach_merge_leap::init(RawBytes& block)
 		throw std::runtime_error("Output DADA buffer does not have enough space for header");
 	}
 	std::memcpy(oblock.ptr(), block.ptr(), block.used_bytes());
-    char buffer[1024];
-    ascii_header_get(block.ptr(), "SAMPLE_CLOCK_START", "%s", buffer);
-    std::size_t sample_clock_start = std::strtoul(buffer, NULL, 0);
-    ascii_header_get(block.ptr(), "CLOCK_SAMPLE", "%s", buffer);
-    long double sample_clock = std::strtold(buffer, NULL);
-    ascii_header_get(block.ptr(), "SYNC_TIME", "%s", buffer);
-    long double sync_time = std::strtold(buffer, NULL);
-    long double unix_time = sync_time + (sample_clock_start / sample_clock);
-    long double mjd_time = (unix_time / 86400.0 ) + 40587;
-    std::ostringstream mjd_start;
-    mjd_start << std::fixed;
-    mjd_start << std::setprecision(12);
-    mjd_start << mjd_time;
-    ascii_header_set(oblock.ptr(), "MJD_START", "%s", mjd_start.str().c_str());
-    ascii_header_set(oblock.ptr(), "UNIX_TIME", "%Lf", unix_time);
+	char buffer[1024];
+	ascii_header_get(block.ptr(), "SAMPLE_CLOCK_START", "%s", buffer);
+	std::size_t sample_clock_start = std::strtoul(buffer, NULL, 0);
+	ascii_header_get(block.ptr(), "CLOCK_SAMPLE", "%s", buffer);
+	long double sample_clock = std::strtold(buffer, NULL);
+	ascii_header_get(block.ptr(), "SYNC_TIME", "%s", buffer);
+	long double sync_time = std::strtold(buffer, NULL);
+	long double unix_time = sync_time + (sample_clock_start / sample_clock);
+	long double mjd_time = (unix_time / 86400.0 ) + 40587;
+	std::ostringstream mjd_start;
+	mjd_start << std::fixed;
+	mjd_start << std::setprecision(12);
+	mjd_start << mjd_time;
+	ascii_header_set(oblock.ptr(), "MJD_START", "%s", mjd_start.str().c_str());
+	ascii_header_set(oblock.ptr(), "UNIX_TIME", "%Lf", unix_time);
 	oblock.used_bytes(oblock.total_bytes());
 	_writer.header_stream().release();
 }
-
-bool EDDRoach_merge_leap::operator()(RawBytes& block)
+bool EDDRoach_merge_leap::operator()(RawBytes & block)
 {
 	BOOST_LOG_TRIVIAL(info) << "nchucnk " << _nchunck << "\n";
 	RawBytes& oblock = _writer.data_stream().next();
-	std::size_t bytes_per_chunk = 4;
-	std::size_t heap_group = _heap_size * _nchunck;
+	const std::size_t bytes_per_chunk = 4;
+	const std::size_t heap_group = _heap_size * _nchunck;
+	const std::size_t num_chunks = block.used_bytes() / heap_group;
 	#pragma omp parallel for num_threads(_nthreads)
-	for (std::size_t xx = 0; xx < block.used_bytes() / heap_group; xx++)
+	for (std::size_t xx = 0; xx < num_chunks; ++xx)
 	{
-		std::vector<char*> ptrs(_nchunck);
+		std::vector<const char*> chunk_ptrs(_nchunck);
 		for (std::size_t ii = 0; ii < _nchunck; ++ii)
 		{
-			ptrs[ii] = block.ptr() + xx * heap_group + ii * heap_group / _nchunck;
+			const std::size_t offset = xx * heap_group + ii * heap_group / _nchunck;
+			chunk_ptrs[ii] = block.ptr() + offset;
 		}
 		const char *target = oblock.ptr() + xx * heap_group;
 
-		for (std::size_t yy = 0; yy < heap_group / _nchunck / bytes_per_chunk; yy++)
+		for (std::size_t yy = 0; yy < heap_group / _nchunck / bytes_per_chunk; ++yy)
 		{
 			for (std::size_t ii = 0; ii < _nchunck; ++ii)
 			{
-				std::memcpy((void*)target, (void*)ptrs[ii], bytes_per_chunk);
-				ptrs[ii] += bytes_per_chunk;
+				std::memcpy((void*)target, chunk_ptrs[ii], bytes_per_chunk);
+				chunk_ptrs[ii] += bytes_per_chunk;
 				target += bytes_per_chunk;
 			}
 		}
@@ -86,6 +80,7 @@ bool EDDRoach_merge_leap::operator()(RawBytes& block)
 	_writer.data_stream().release();
 	return false;
 }
+
 }//edd
 }//effelsberg
 }//psrdada_cpp