diff --git a/psrdada_cpp/common/src/unpacker.cu b/psrdada_cpp/common/src/unpacker.cu
index ca2aa40102ee67508aec8e0221969c1a103b9b6f..9a84387adf22b7fdb4578ed4e7943850ce5fb199 100644
--- a/psrdada_cpp/common/src/unpacker.cu
+++ b/psrdada_cpp/common/src/unpacker.cu
@@ -103,8 +103,48 @@ void unpack_edd_10bit_to_float32(uint64_t const* __restrict__ in, float* __restr
 }
 
 
-
-
+__global__
+void unpack_edd_12bit_to_float32_non512(ulong3 const* __restrict__ in, float* __restrict__ out, int n)
+{
+    /**
+     * Note: This kernel is significantly slower than the 512-thread 12-bit unpacking
+     *       kernel below, but it works for arbitrary input sizes.
+     */
+    float sout[16];
+    uint64_t val;
+    uint64_t rest;
+    for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x)
+    {
+        ulong3 value = in[idx];
+        val  = swap64(value.x);
+        sout[0] = (float)((int64_t)(( 0xFFF0000000000000 & val) <<  0) >> 52);
+        sout[1] = (float)((int64_t)(( 0x000FFF0000000000 & val) << 12) >> 52);
+        sout[2] = (float)((int64_t)(( 0x000000FFF0000000 & val) << 24) >> 52);
+        sout[3] = (float)((int64_t)(( 0x000000000FFF0000 & val) << 36) >> 52);
+        sout[4] = (float)((int64_t)(( 0x000000000000FFF0 & val) << 48) >> 52);
+        rest    =                   ( 0x000000000000000F & val) << 60;
+        val  = swap64(value.y);
+        sout[5] = (float)((int64_t)((( 0xFF00000000000000 & val) >> 4) | rest) >> 52);
+        sout[6] = (float)((int64_t)((  0x00FFF00000000000 & val) << 8)  >> 52);
+        sout[7] = (float)((int64_t)((  0x00000FFF00000000 & val) << 20) >> 52);
+        sout[8] = (float)((int64_t)((  0x00000000FFF00000 & val) << 32) >> 52);
+        sout[9] = (float)((int64_t)((  0x00000000000FFF00 & val) << 44) >> 52);
+        rest    =                   (  0x00000000000000FF & val) << 56;
+        val  = swap64(value.z);
+        sout[10] = (float)((int64_t)((( 0xF000000000000000 & val) >>  8) | rest) >> 52);
+        sout[11] = (float)((int64_t)((  0x0FFF000000000000 & val) <<  4) >> 52);
+        sout[12] = (float)((int64_t)((  0x0000FFF000000000 & val) << 16) >> 52);
+        sout[13] = (float)((int64_t)((  0x0000000FFF000000 & val) << 28) >> 52);
+        sout[14] = (float)((int64_t)((  0x0000000000FFF000 & val) << 40) >> 52);
+        sout[15] = (float)((int64_t)((  0x0000000000000FFF & val) << 52) >> 52);
+
+        const std::size_t output_idx = idx * 16;
+        for (int ii = 0; ii < 16; ++ii)
+        {
+            out[output_idx + ii] = sout[ii];
+        }
+    }
+}
 
 __global__
 void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restrict__ out, int n)
@@ -112,25 +152,26 @@ void unpack_edd_12bit_to_float32(uint64_t const* __restrict__ in, float* __restr
     /**
      * Note: This kernels will not work with more than 512 threads.
      */
-    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16];
-    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3];
+    __shared__ volatile float tmp_out[EDD_NTHREADS_UNPACK * 16]; // 8192 floats 
+    __shared__ volatile uint64_t tmp_in[EDD_NTHREADS_UNPACK * 3]; // 1536 uint64_t
     int block_idx = blockIdx.x;
     uint64_t val;
     uint64_t rest;
     volatile float* sout = tmp_out + (16 * threadIdx.x);
     // We search for the next 512 multiple of size n to utilize all threads in the outer
     // loop. If we set (3 * idx + 2) < n, the kernel only works for n % 512 = 0 && n % 3 = 0
-    int next_multiple = ((n >> 9) + 1) << 9;
+    int next_multiple = ((n >> 9) + 1) << 9; // round n up to the next multiple of 512
     for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
         (3 * idx + 2) < next_multiple;
         idx+=gridDim.x*blockDim.x)
     {
+
         if(3 * idx < n)
         {
             //Read to shared memeory
             int block_read_start = block_idx * EDD_NTHREADS_UNPACK * 3;
-            tmp_in[threadIdx.x]                = in[block_read_start + threadIdx.x];
-            tmp_in[EDD_NTHREADS_UNPACK + threadIdx.x]     = in[block_read_start + EDD_NTHREADS_UNPACK + threadIdx.x];
+            tmp_in[EDD_NTHREADS_UNPACK * 0 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 0 + threadIdx.x];
+            tmp_in[EDD_NTHREADS_UNPACK * 1 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 1 + threadIdx.x];
             tmp_in[EDD_NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + EDD_NTHREADS_UNPACK * 2 + threadIdx.x];
         }
         __syncthreads();
@@ -225,18 +266,28 @@ Unpacker::~Unpacker()
 template <>
 void Unpacker::unpack<12>(const uint64_t* input, float* output, size_t size)
 {
-    assert(size % 3 == 0);
+
+    if (size % 3 != 0) {
+        throw std::invalid_argument("Input size must be a multiple of 3: size = " + std::to_string(size));
+    }
     BOOST_LOG_TRIVIAL(debug) << "  - Unpacking 12-bit data";
-    int nblocks = size / EDD_NTHREADS_UNPACK;
-    kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
-            input, output, size);
+    if (size % EDD_NTHREADS_UNPACK == 0) {
+        int nblocks = size / EDD_NTHREADS_UNPACK;
+        kernels::unpack_edd_12bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
+                input, output, size);
+    } else {
+        int nthreads = 1024;
+        int nblocks = ((size/3) % nthreads == 0) ? (size/3) / nthreads : (size/3) / nthreads + 1;
+        kernels::unpack_edd_12bit_to_float32_non512<<< nblocks, nthreads, 0, _stream>>>(
+                reinterpret_cast<const ulong3*>(input), output, size/3);
+    }
 }
 
 template <>
 void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
 {
     BOOST_LOG_TRIVIAL(debug) << "  - Unpacking 8-bit data";
-    int nblocks = size / EDD_NTHREADS_UNPACK;
+    int nblocks = (size % EDD_NTHREADS_UNPACK == 0) ? size / EDD_NTHREADS_UNPACK : size / EDD_NTHREADS_UNPACK + 1;
     kernels::unpack_edd_8bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK, 0, _stream>>>(
             input, output, size);
 }
@@ -244,9 +295,11 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
 template <>
 void Unpacker::unpack<10>(const uint64_t* input, float* output, size_t size)
 {
-    assert(size % 5 == 0);
+    if (size % 5 != 0) {
+        throw std::invalid_argument("Input size must be a multiple of 5: size = " + std::to_string(size));
+    }
     BOOST_LOG_TRIVIAL(debug) << "  - Unpacking 10-bit data";
-    int nblocks = size / EDD_NTHREADS_UNPACK10;
+    int nblocks = (size % EDD_NTHREADS_UNPACK10 == 0) ? size / EDD_NTHREADS_UNPACK10 : size / EDD_NTHREADS_UNPACK10 + 1;
     kernels::unpack_edd_10bit_to_float32<<< nblocks, EDD_NTHREADS_UNPACK10, 0, _stream>>>(
             input, output, size);
 }
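For orientation (not part of the patch): the EDD 12-bit format packs sixteen samples into every three 64-bit words, and both 12-bit kernels isolate a sample by shifting it into bits 63..52 of a 64-bit register and sign-extending it with an arithmetic right shift by 52. The host-side sketch below reproduces that decode for whole 192-bit groups; it assumes swap64 is a plain 64-bit byte swap (stood in for here by __builtin_bswap64), and the names bswap64 and unpack12_reference are hypothetical, not taken from the library.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical host-side reference, assuming swap64 is a byte swap.
    static inline uint64_t bswap64(uint64_t v) { return __builtin_bswap64(v); }

    // Decode groups of three big-endian 64-bit words into 16 signed 12-bit samples,
    // using the same trick as the kernels: move each sample into bits 63..52 and
    // sign-extend with an arithmetic right shift by 52.
    void unpack12_reference(const uint64_t* in, float* out, std::size_t nwords)
    {
        for (std::size_t w = 0; w + 2 < nwords; w += 3)
        {
            const uint64_t bits[3] = { bswap64(in[w]), bswap64(in[w + 1]), bswap64(in[w + 2]) };
            for (int s = 0; s < 16; ++s)
            {
                const std::size_t offset = static_cast<std::size_t>(s) * 12; // bit offset from the MSB of the group
                const std::size_t word   = offset / 64;
                const std::size_t shift  = offset % 64;
                uint64_t top = bits[word] << shift;            // sample's leading bits into 63..52
                if (shift > 52)                                // sample straddles a word boundary
                    top |= bits[word + 1] >> (64 - shift);     // pull the remaining bits from the next word
                out[(w / 3) * 16 + s] =
                    static_cast<float>(static_cast<int64_t>(top) >> 52);
            }
        }
    }

The ternary grid-size expressions in the host wrappers (for unpack<8>, unpack<10>, and the non-512 path of unpack<12>) are the explicit form of the usual ceil division (size + nthreads - 1) / nthreads, so the last partial block is still launched when size is not a multiple of the block size.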
diff --git a/psrdada_cpp/common/test/src/unpacker_tester.cu b/psrdada_cpp/common/test/src/unpacker_tester.cu
index 72d07e401ac33b5689d1b9affb9129c4dc4a9b1e..8c58db09a18916228512a8afd07672e2d3069b7e 100644
--- a/psrdada_cpp/common/test/src/unpacker_tester.cu
+++ b/psrdada_cpp/common/test/src/unpacker_tester.cu
@@ -226,6 +226,7 @@ TEST_F(UnpackerTester, test_8bit_unpacking_with_type_casting)
 
     Unpacker unpacker(_stream);
     unpacker.unpack<8>(idata, odata);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     Unpacker::OutputType h_odata = odata;
     compare_against_host(h_odata, reference);
 }
@@ -245,6 +246,7 @@ TEST_F(UnpackerTester, test_8bit_unpacking_with_type_casting_non_512_multiple)
 
     Unpacker unpacker(_stream);
     unpacker.unpack<8>(idata, odata);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     Unpacker::OutputType h_odata = odata;
     compare_against_host(h_odata, reference);
 }
@@ -259,13 +261,14 @@ TEST_F(UnpackerTester, 12_bit_unpack_test)
     OutputType host_output;
     Unpacker unpacker(_stream);
     unpacker.unpack<12>(gpu_input, gpu_output);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     unpacker_12_to_32_c_reference(host_input, host_output);
     compare_against_host(gpu_output, host_output);
 }
 
 TEST_F(UnpackerTester, 12_bit_unpack_test_non_512_multiple)
 {
-    std::size_t n = 1017*3;
+    std::size_t n = 1017 * 3;
     InputType host_input = testing_tools::random_vector<uint64_t>(1, 1<<31, n);
     Unpacker::InputType gpu_input = host_input;
     Unpacker::OutputType gpu_output;
@@ -273,6 +276,7 @@ TEST_F(UnpackerTester, 12_bit_unpack_test_non_512_multiple)
     OutputType host_output;
     Unpacker unpacker(_stream);
     unpacker.unpack<12>(gpu_input, gpu_output);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     unpacker_12_to_32_c_reference(host_input, host_output);
     compare_against_host(gpu_output, host_output);
 }
@@ -296,6 +300,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
     OutputType host_output;
     Unpacker unpacker(_stream);
     unpacker.unpack<8>(gpu_input, gpu_output);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     unpacker_8_to_32_c_reference(host_input, host_output);
     compare_against_host(gpu_output, host_output);
 }
@@ -310,6 +315,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test_non_512_multiple)
     OutputType host_output;
     Unpacker unpacker(_stream);
     unpacker.unpack<8>(gpu_input, gpu_output);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     unpacker_8_to_32_c_reference(host_input, host_output);
     compare_against_host(gpu_output, host_output);
 }
@@ -333,6 +339,7 @@ TEST_F(UnpackerTester, 10_bit_unpack_test)
     OutputType host_output;
     Unpacker unpacker(_stream);
     unpacker.unpack<10>(gpu_input, gpu_output);
+    CUDA_ERROR_CHECK(cudaStreamSynchronize(_stream));
     unpacker_10_to_32_c_reference(host_input, host_output);
     compare_against_host(gpu_output, host_output);
 }
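On the test side, the added cudaStreamSynchronize calls make the ordering between the unpack kernels (launched on the tester's _stream) and the subsequent copy of the output back to the host explicit, rather than relying on default-stream blocking behaviour. A minimal sketch of that launch / synchronize / copy pattern follows; the fill kernel and the CHECK macro are stand-ins for the tester's kernels and CUDA_ERROR_CHECK, not part of the library.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Hypothetical stand-in for the tester's CUDA_ERROR_CHECK macro.
    #define CHECK(call)                                                              \
        do {                                                                         \
            cudaError_t err_ = (call);                                               \
            if (err_ != cudaSuccess) {                                               \
                std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));  \
                std::exit(1);                                                        \
            }                                                                        \
        } while (0)

    __global__ void fill(float* out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = static_cast<float>(i);
    }

    int main()
    {
        const int n = 1017 * 16;                      // deliberately not a multiple of 512
        float* d_out = nullptr;
        CHECK(cudaMalloc(&d_out, n * sizeof(float)));

        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        fill<<<(n + 255) / 256, 256, 0, stream>>>(d_out, n);
        CHECK(cudaGetLastError());                    // catch launch configuration errors
        CHECK(cudaStreamSynchronize(stream));         // wait for the kernel before reading results

        std::vector<float> h_out(n);
        CHECK(cudaMemcpy(h_out.data(), d_out, n * sizeof(float), cudaMemcpyDeviceToHost));

        CHECK(cudaFree(d_out));
        CHECK(cudaStreamDestroy(stream));
        return 0;
    }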