diff --git a/psrdada_cpp/effelsberg/edd/SKRfiReplacementCuda.cuh b/psrdada_cpp/effelsberg/edd/SKRfiReplacementCuda.cuh
index f806e403784b5264dff543f72d7152619b708dd0..2ab940dddefb81a5a047204b2f2ce3ef44bf9466 100644
--- a/psrdada_cpp/effelsberg/edd/SKRfiReplacementCuda.cuh
+++ b/psrdada_cpp/effelsberg/edd/SKRfiReplacementCuda.cuh
@@ -9,6 +9,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/random/normal_distribution.h>
 #include <thrust/random/linear_congruential_engine.h>
+#include <nvToolsExt.h>
 
 namespace psrdada_cpp {
 namespace effelsberg {
diff --git a/psrdada_cpp/effelsberg/edd/SpectralKurtosisCuda.cuh b/psrdada_cpp/effelsberg/edd/SpectralKurtosisCuda.cuh
index f231203bfcc6c0c944042d40cf9707433f355474..ab4bdcb8c8ceec3df3b3a349fa6a30d79b0d4897 100644
--- a/psrdada_cpp/effelsberg/edd/SpectralKurtosisCuda.cuh
+++ b/psrdada_cpp/effelsberg/edd/SpectralKurtosisCuda.cuh
@@ -8,6 +8,7 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/execution_policy.h>
+#include <nvToolsExt.h>
 
 namespace psrdada_cpp {
 namespace effelsberg {
diff --git a/psrdada_cpp/effelsberg/edd/src/SKRfiReplacementCuda.cu b/psrdada_cpp/effelsberg/edd/src/SKRfiReplacementCuda.cu
index 7e954392c117d722dce458890494f80c231d56f7..dcfab35a42919ca39bb4a73b5078486613d9dcac 100644
--- a/psrdada_cpp/effelsberg/edd/src/SKRfiReplacementCuda.cu
+++ b/psrdada_cpp/effelsberg/edd/src/SKRfiReplacementCuda.cu
@@ -51,6 +51,7 @@ void SKRfiReplacementCuda::init()
 
 void SKRfiReplacementCuda::get_rfi_window_indices()
 {
+    nvtxRangePushA("get_rfi_window_indices");
     _nrfi_windows = thrust::count(_rfi_status.begin(), _rfi_status.end(), 1);
     _rfi_window_indices.resize(_nrfi_windows);
     thrust::copy_if(thrust::make_counting_iterator<int>(0),
@@ -58,10 +59,12 @@ void SKRfiReplacementCuda::get_rfi_window_indices()
                     _rfi_status.begin(),
                     _rfi_window_indices.begin(),
                     thrust::placeholders::_1 == 1);
+    nvtxRangePop();
 }
 
 void SKRfiReplacementCuda::get_clean_window_indices()
 {
+    nvtxRangePushA("get_clean_window_indices");
     _nclean_windows = thrust::count(_rfi_status.begin(), _rfi_status.end(), 0);
     _clean_window_indices.resize(DEFAULT_NUM_CLEAN_WINDOWS);
     thrust::copy_if(thrust::make_counting_iterator<int>(0),
@@ -69,10 +72,12 @@ void SKRfiReplacementCuda::get_clean_window_indices()
                     _rfi_status.begin(),
                     _clean_window_indices.begin(),
                     thrust::placeholders::_1 == 0);
+    nvtxRangePop();
 }
 
 void SKRfiReplacementCuda::get_clean_data_statistics(const thrust::device_vector<thrust::complex<float>> &data)
 {
+    nvtxRangePushA("get_clean_data_statistics");
     _window_size = data.size() / _nwindows;
     _clean_data.resize(DEFAULT_NUM_CLEAN_WINDOWS * _window_size);
     for(std::size_t ii = 0; ii < DEFAULT_NUM_CLEAN_WINDOWS; ii++){
@@ -84,15 +89,18 @@ void SKRfiReplacementCuda::get_clean_data_statistics(const thrust::device_vector
         BOOST_LOG_TRIVIAL(debug) <<"clean_win_index = " << window_index
                                  << " ibegin = " << ibegin << " iend = " << iend;
     }
+    nvtxRangePop(); // end this range first: compute_clean_data_statistics() below opens its own NVTX range
     compute_clean_data_statistics();
 }
 
 void SKRfiReplacementCuda::compute_clean_data_statistics() 
 {
+    nvtxRangePushA("compute_clean_data_statistics");
     std::size_t length = _clean_data.size();
     _ref_mean = (thrust::reduce(_clean_data.begin(), _clean_data.end(), thrust::complex<float> (0.0f, 0.0f))). real() / length;
     _ref_sd = std::sqrt(thrust::transform_reduce(_clean_data.begin(), _clean_data.end(), mean_subtraction_square(_ref_mean),
                         0.0f, thrust::plus<float> ()) / length);
+    nvtxRangePop();
     BOOST_LOG_TRIVIAL(debug) << "DataStatistics mean = " << _ref_mean
                              << " sd =  " << _ref_sd;
 }
@@ -100,6 +108,7 @@ void SKRfiReplacementCuda::compute_clean_data_statistics()
 void SKRfiReplacementCuda::replace_rfi_data(const thrust::device_vector<int> &rfi_status,
                                             thrust::device_vector<thrust::complex<float>> &data)
 {
+    nvtxRangePushA("replace_rfi_data");
     _rfi_status = rfi_status;
     thrust::device_vector<thrust::complex<float>> replacement_data;
     //initialize data members of the class
@@ -109,12 +118,15 @@ void SKRfiReplacementCuda::replace_rfi_data(const thrust::device_vector<int> &rf
         get_clean_data_statistics(data);
 	//Replacing RFI
 	thrust::counting_iterator<unsigned int> sequence_index_begin(0);
+        nvtxRangePushA("replace_rfi_data_loop"); // nested NVTX range covering only the per-window replacement loop
 	for(std::size_t ii = 0; ii < _nrfi_windows; ii++){
             std::size_t index = _rfi_window_indices[ii] * _window_size;
             thrust::transform(sequence_index_begin, (sequence_index_begin + _window_size), 
                               (data.begin() + index), generate_replacement_data(_ref_mean, _ref_sd));
         }
+        nvtxRangePop();
     }
+    nvtxRangePop();
 }
 } //edd
 } //effelsberg
diff --git a/psrdada_cpp/effelsberg/edd/src/SpectralKurtosisCuda.cu b/psrdada_cpp/effelsberg/edd/src/SpectralKurtosisCuda.cu
index 917e661061e0d8a67177b6bde8c00e498d192d73..1bcc668cd60d636721fc9d4c055bdaeacb12a6a2 100755
--- a/psrdada_cpp/effelsberg/edd/src/SpectralKurtosisCuda.cu
+++ b/psrdada_cpp/effelsberg/edd/src/SpectralKurtosisCuda.cu
@@ -39,17 +39,6 @@ struct check_rfi{
    }
 };
 
-struct set_indices{
-    const std::size_t M; //window_size
-    set_indices(std::size_t m) : M(m) {}
-    
-    __host__ __device__
-    int operator()(int z)
-    {
-        return (z / M);
-    }
-};
-
 SpectralKurtosisCuda::SpectralKurtosisCuda(std::size_t nchannels, std::size_t window_size, float sk_min, float sk_max)
     : _nchannels(nchannels),
       _window_size(window_size),
@@ -77,6 +66,7 @@ void SpectralKurtosisCuda::init()
 }
 
 void SpectralKurtosisCuda::compute_sk(const thrust::device_vector<thrust::complex<float>> &data, RFIStatistics &stats){
+    nvtxRangePushA("compute_sk");
     _sample_size = data.size();
     BOOST_LOG_TRIVIAL(debug) << "Computing SK for sample_size " << _sample_size
                              << " and window_size " << _window_size <<".\n";
@@ -84,15 +74,15 @@ void SpectralKurtosisCuda::compute_sk(const thrust::device_vector<thrust::comple
     init();
     //computing _d_s1 for all windows
     thrust::reduce_by_key(thrust::device, 
-                          thrust::make_transform_iterator(thrust::counting_iterator<int> (0), set_indices(_window_size)),
-                          thrust::make_transform_iterator(thrust::counting_iterator<int> ((_sample_size - 1)), set_indices(_window_size)), 
+                          thrust::make_transform_iterator(thrust::counting_iterator<int> (0), (thrust::placeholders::_1 / _window_size)),
+                          thrust::make_transform_iterator(thrust::counting_iterator<int> (_sample_size), (thrust::placeholders::_1 / _window_size)), // exclusive end: one past the last sample index, so the final sample is included in S1
                           thrust::make_transform_iterator(data.begin(), compute_power()), 
                           thrust::discard_iterator<int>(), 
                           _d_s1.begin());
     //computing _d_s2  for all windows
     thrust::reduce_by_key(thrust::device, 
-                          thrust::make_transform_iterator(thrust::counting_iterator<int> (0), set_indices(_window_size)),
-                          thrust::make_transform_iterator(thrust::counting_iterator<int> ((_sample_size - 1)), set_indices(_window_size)), 
+                          thrust::make_transform_iterator(thrust::counting_iterator<int> (0), (thrust::placeholders::_1 / _window_size)),
+                          thrust::make_transform_iterator(thrust::counting_iterator<int> (_sample_size), (thrust::placeholders::_1 / _window_size)), // exclusive end: must match the S1 reduction so both sums cover the same samples
                           thrust::make_transform_iterator(data.begin(), power_square()), 
                           thrust::discard_iterator<int>(), 
                           _d_s2.begin());
@@ -101,6 +91,7 @@ void SpectralKurtosisCuda::compute_sk(const thrust::device_vector<thrust::comple
     thrust::transform(_d_s1.begin(), _d_s1.end(), _d_s2.begin(), stats.rfi_status.begin(), check_rfi(_window_size, _sk_min, _sk_max));
     stats.rfi_fraction = thrust::reduce(stats.rfi_status.begin(), stats.rfi_status.end(), 0.0f) / _nwindows;
     BOOST_LOG_TRIVIAL(info) << "RFI fraction: " << stats.rfi_fraction;
+    nvtxRangePop();
 }
 } //edd
 } //effelsberg
diff --git a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt
index 032c1f0135cce2db5155b48b970dcfb8697536ba..e7812caf9f5cdcee1a6cf23608b5693af49f12ff 100644
--- a/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt
+++ b/psrdada_cpp/effelsberg/edd/test/CMakeLists.txt
@@ -18,6 +18,6 @@ set(
     src/SpectralKurtosisCudaTester.cu
 )
 cuda_add_executable(gtest_edd ${gtest_edd_src} )
-target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} -lcublas)
+target_link_libraries(gtest_edd ${PSRDADA_CPP_EFFELSBERG_EDD_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} -lcublas -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lnvToolsExt) # nvToolsExt supplies the NVTX range API; locate it in the detected CUDA toolkit instead of a fixed cuda-10.1 path
 add_test(gtest_edd gtest_edd --test_data "${CMAKE_CURRENT_LIST_DIR}/data")