diff --git a/psrdada_cpp/effelsberg/edd/Packer.cuh b/psrdada_cpp/effelsberg/edd/Packer.cuh
index f5607de63662f7318311b14601c8febafeaeaad6..e0fb321511db5790367b49426938f6b07faadc9a 100644
--- a/psrdada_cpp/effelsberg/edd/Packer.cuh
+++ b/psrdada_cpp/effelsberg/edd/Packer.cuh
@@ -11,6 +11,44 @@ namespace edd {
 namespace kernels {
 
 
+// convert a float to an int32 clipped to minv, maxv and with a maxium
+// bit_depth. For an input_bit_depth of 2 and 4 the loop is faster than fmin,
+// fmax
+template <unsigned int input_bit_depth>
+__device__ __forceinline__ uint32_t convert32(float inp, float maxV, float minV, float level)
+{
+  uint32_t p = 0;
+  #pragma unroll
+  for (int k = 1; k < (1 << input_bit_depth); k++) {
+    p += (inp > ((k * level) + minV));
+  } // this is more efficient than fmin, fmax for clamp and cast.
+  return p;
+}
+
+template <>
+__device__ __forceinline__ uint32_t convert32<8>(float inp, float maxV, float minV, float level)
+{
+  inp -= minV;
+  inp /= level;
+  inp = fminf(inp, ((1 << 8)- 1));
+  inp = fmaxf(inp, 0);
+  uint32_t p = uint32_t (inp);
+  return p;
+}
+
+template <>
+__device__ __forceinline__ uint32_t convert32<16>(float inp, float maxV, float minV, float level)
+{
+  inp -= minV;
+  inp /= level;
+  inp = fminf(inp, ((1 << 16)- 1));
+  inp = fmaxf(inp, 0);
+  uint32_t p = uint32_t (inp);
+  return p;
+}
+
+
+
 // pack float to 2,4,8,16 bit integers with linear scaling
 template <unsigned int input_bit_depth>
 __global__ void packNbit(const float *__restrict__ input,
@@ -32,12 +70,7 @@ __global__ void packNbit(const float *__restrict__ input,
       // Load new input value, clip and convert to Nbit integer
       const float inp = input[i + j * blockDim.x];
 
-      uint32_t p = 0;
-      #pragma unroll
-      for (int k = 1; k < (1 << input_bit_depth); k++) {
-        p += (inp > ((k * l) + minV));
-      } // this is more efficient than fmin, fmax for clamp and cast.
-
+      uint32_t p = convert32<input_bit_depth>(inp, maxV, minV, l);
       // store in shared memory with linear access
       tmp[threadIdx.x] += p << (input_bit_depth * j);
     }
@@ -64,6 +97,7 @@ __global__ void packNbit(const float *__restrict__ input,
     __syncthreads();
   }
 }
+
 } // namespace kernels
 
 
diff --git a/psrdada_cpp/effelsberg/edd/src/VLBI_cli.cu b/psrdada_cpp/effelsberg/edd/src/VLBI_cli.cu
index 8c92df1365f002ebb0e21db79a4062d996377db7..dfe82c39371e0df23d5a09bc4d4b82e15f063c08 100644
--- a/psrdada_cpp/effelsberg/edd/src/VLBI_cli.cu
+++ b/psrdada_cpp/effelsberg/edd/src/VLBI_cli.cu
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
     vdifHeader.setThreadId(thread_id);
     vdifHeader.setStationId(station_id);
     BOOST_LOG_TRIVIAL(warning) << "SETTING FIXED REFERENCE EPOCH AND SECONDS FROM EPOCH!! Should be read from data stream!!";
-    vdifHeader.setReferenceEpoch(123);
+    vdifHeader.setReferenceEpoch(0);
     vdifHeader.setSecondsFromReferenceEpoch(42); // for first block
 
 
diff --git a/psrdada_cpp/effelsberg/edd/test/src/PackerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/PackerTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cc5eac9f76dfb49d9ef1e4432981ec8843808b95
--- /dev/null
+++ b/psrdada_cpp/effelsberg/edd/test/src/PackerTest.cu
@@ -0,0 +1,142 @@
+#include <time.h>
+#include <stdlib.h>
+#include "psrdada_cpp/effelsberg/edd/Packer.cuh"
+
+#include "gtest/gtest.h"
+
+//TEST(PackerTest, check2bit)
+//{
+//    std::size_t n = 1024;
+//    thrust::device_vector<float>  input(n);
+//    thrust::device_vector<uint32_t>  output(n);
+//
+//    {
+//      float minV = -2;
+//      float maxV = 2;
+//
+//      srand (time(NULL));
+//      for (int i =0; i < input.size(); i++)
+//      {
+//        input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV;
+//      }
+//
+//      thrust::fill(output.begin(), output.end(), 5);
+//      cudaStream_t stream;
+//      cudaStreamCreate(&stream);
+//      psrdada_cpp::effelsberg::edd::pack<2>(input, output, minV, maxV, stream);
+//      EXPECT_EQ(output.size(), n / 16);
+//
+//      float step = (maxV - minV) / 3;
+//      float L2 = minV + step;
+//      float L3 = minV + 2 * step;
+//      float L4 = minV + 3 * step;
+//
+//      const size_t nbp = 16; // 16 samples per output value
+//      for(int i = 0; i < input.size() / nbp; i++)
+//      {
+//          uint32_t of = output[i];
+//          for (size_t j =0; j< nbp; j++)
+//          {
+//            uint32_t a = ((of >> (j *2)) & 3);
+//            int k = i * nbp + j;
+//            if (input[k] >= L4)
+//              EXPECT_EQ(a, 3);
+//            else if (input[k] >= L3)
+//              EXPECT_EQ(a, 2);
+//            else if (input[k] >= L2)
+//              EXPECT_EQ(a, 1);
+//            else
+//              EXPECT_EQ(a, 0);
+//          }
+//      }
+//    }
+//}
+
+class PackerTest: public ::testing::Test
+{
+  protected:
+    thrust::device_vector<float>  input;
+    thrust::device_vector<uint32_t>  output; 
+    float minV;
+    float maxV;
+    cudaStream_t stream;
+
+    void SetUp() override {
+      input.resize(1024);
+      minV = -2;
+      maxV = 2;
+
+      srand (time(NULL));
+      for (int i =0; i < input.size(); i++)
+      {
+        input[i] = ((float(rand()) / RAND_MAX) - 0.5) * 2.5 * (maxV-minV) + maxV + minV;
+      }
+
+      cudaStreamCreate(&stream);
+    }
+
+    void TearDown()
+    {
+      cudaStreamDestroy(stream);
+    }
+
+    void checkOutputSize(unsigned int bit_depth)
+    {
+      //SCOPED_TRACE("Input Bitdepth: " << bit_depth );
+      EXPECT_EQ(output.size(), input.size() / (32 / bit_depth));
+    }
+
+
+    void checkOutputValues(unsigned int bit_depth)
+    {
+
+      float step = (maxV - minV) /  ((1 << bit_depth) - 1);
+
+      const size_t nbp = 32 / bit_depth; 
+      for(int i = 0; i < input.size() / nbp; i++)
+      {
+          uint32_t of = output[i];
+          for (size_t j =0; j< nbp; j++)
+          {
+            uint32_t a = ((of >> (j * bit_depth)) & ((1 << bit_depth) - 1));
+            int k = i * nbp + j;
+
+            if (input[k] <= minV)
+              EXPECT_EQ(0, int (a)) << "input[ " << k << "] = " << input[k];
+            else if (input[k] >= maxV)
+              EXPECT_EQ(((1 << bit_depth) - 1), int (a)) << "input[ " << k << "] = " << input[k];
+            else
+              EXPECT_EQ(int((input[k] - minV) / step), int(a)) << "input[ " << k << "] = " << input[k];
+          }
+      }
+    }
+    };
+
+TEST_F(PackerTest, 2bit)
+{
+  psrdada_cpp::effelsberg::edd::pack<2>(input, output, minV, maxV, stream);
+  checkOutputSize(2);
+  checkOutputValues(2);
+}
+
+
+TEST_F(PackerTest, 4bit)
+{
+  psrdada_cpp::effelsberg::edd::pack<4>(input, output, minV, maxV, stream);
+  checkOutputSize(4);
+  checkOutputValues(4);
+}
+
+TEST_F(PackerTest, 8bit)
+{
+  psrdada_cpp::effelsberg::edd::pack<8>(input, output, minV, maxV, stream);
+  checkOutputSize(8);
+  checkOutputValues(8);
+}
+
+TEST_F(PackerTest, 16bit)
+{
+  psrdada_cpp::effelsberg::edd::pack<16>(input, output, minV, maxV, stream);
+  checkOutputSize(16);
+  checkOutputValues(16);
+}