diff --git a/bfps/_code.py b/bfps/_code.py
index ec1557d2f68ff7bad58abbf629fcdd7a21d8f341..aff52a6e82a5bce403ee0168f89d976d11467d1f 100644
--- a/bfps/_code.py
+++ b/bfps/_code.py
@@ -353,6 +353,7 @@ class _code(_base):
                           '\n')
         script_file.write('echo "Start time is `date`"\n')
         script_file.write('cd ' + self.work_dir + '\n')
+        script_file.write('cp -s ../*.h5 ./\n')
         script_file.write('poe ' +
                 os.path.join(
                     self.work_dir,
@@ -423,6 +424,7 @@ class _code(_base):
                           '\n')
         script_file.write('echo "This is step $LOADL_STEP_ID out of {0}"\n'.format(njobs))
         script_file.write('echo "Start time is `date`"\n')
+        script_file.write('cp -s ../*.h5 ./\n')
         script_file.write('cd ' + self.work_dir + '\n')
         script_file.write('poe ' +
                 os.path.join(
diff --git a/bfps/cpp/field.cpp b/bfps/cpp/field.cpp
index aef3cb840c86b419cf834c76f4d917b2d87df60e..0d065cede5ba61519b3cfa4ddb854e9bfb3c95ad 100644
--- a/bfps/cpp/field.cpp
+++ b/bfps/cpp/field.cpp
@@ -774,6 +774,8 @@ kspace<be, dt>::kspace(
         std::fill_n(nshell_local, this->nshells, 0);
     });
 
+    std::vector<std::unordered_map<int, double>> dealias_filter_threaded(omp_get_max_threads());
+
     KSPACE_CLOOP_K2_NXMODES(
             this,[&](ptrdiff_t /*cindex*/, hsize_t /*yindex*/, hsize_t /*zindex*/, int nxmodes, hsize_t /*xindex*/, double k2){
                 if (k2 < this->kM2)
@@ -784,10 +786,16 @@ kspace<be, dt>::kspace(
                 }
                 if (dt == TWO_THIRDS){
                     // Should not be any race condition here it is a "write"
-                    this->dealias_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));
+                    dealias_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));
                 }
             });
 
+    for(int idxMerge = 0 ; idxMerge < int(dealias_filter_threaded.size()) ; ++idxMerge){
+        for(const auto kv : dealias_filter_threaded[idxMerge]){
+            this->dealias_filter[kv.first] = kv.second;
+        }
+    }
+
     nshell_local_threaded.mergeParallel();
     kshell_local_threaded.mergeParallel();
 
diff --git a/bfps/cpp/fluid_solver_base.cpp b/bfps/cpp/fluid_solver_base.cpp
index b50c7c8ce2d1be4949e3307a44566cc4b09c63cd..393f086d02676c0faf3d047cc938b4befea64b93 100644
--- a/bfps/cpp/fluid_solver_base.cpp
+++ b/bfps/cpp/fluid_solver_base.cpp
@@ -513,6 +513,8 @@ fluid_solver_base<rnumber>::fluid_solver_base(
     int64_t *nshell_local = new int64_t[this->nshells];
     std::fill_n(nshell_local, this->nshells, 0.0);
 
+    std::vector<std::unordered_map<int, double>> Fourier_filter_threaded(omp_get_max_threads());
+
     CLOOP_K2_NXMODES(
                 this,
 
@@ -524,9 +526,16 @@ fluid_solver_base<rnumber>::fluid_solver_base(
             nshell_local[int(knorm/this->dk)] += nxmodes;
             kshell_local[int(knorm/this->dk)] += nxmodes*knorm;
         }
-        this->Fourier_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));}
+        Fourier_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));}
     );
 
+    // Merge the per-thread maps (one slot per omp_get_thread_num()) into this->Fourier_filter
+    for(int idxMerge = 0 ; idxMerge < int(Fourier_filter_threaded.size()) ; ++idxMerge){
+        for(const auto kv : Fourier_filter_threaded[idxMerge]){
+            this->Fourier_filter[kv.first] = kv.second;
+        }
+    }
+
     MPI_Allreduce(
                 (void*)(nshell_local),
                 (void*)(this->nshell),
diff --git a/bfps/cpp/scope_timer.hpp b/bfps/cpp/scope_timer.hpp
index 5fd6578c9e9f593191a44049c4f425448a53b21a..76fc69f16f1772aca3242756fcbaf58acf0eed29 100644
--- a/bfps/cpp/scope_timer.hpp
+++ b/bfps/cpp/scope_timer.hpp
@@ -37,7 +37,7 @@
 #include <mpi.h>
 #include <cstring>
 #include <stdexcept>
-
+#include <fstream>
 #include "base.hpp"
 #include "bfps_timer.hpp"
 
@@ -185,10 +185,10 @@ public:
         retMpi = MPI_Comm_size( inComm, &nbProcess);
         assert(retMpi == MPI_SUCCESS);
 
-        if((&outputStream == &std::cout || &outputStream == &std::clog) && myrank != nbProcess-1){
+        if((&outputStream == &std::cout || &outputStream == &std::clog) && myRank != nbProcess-1){
             // Print in reverse order
             char tmp;
-            retMpi = MPI_Recv(&tmp, 1, MPI_BYTE, myrank+1, 99, inComm, MPI_STATUS_IGNORE);
+            retMpi = MPI_Recv(&tmp, 1, MPI_BYTE, myRank+1, 99, inComm, MPI_STATUS_IGNORE);
             assert(retMpi == MPI_SUCCESS);
         }
         outputStream.flush();
@@ -230,10 +230,10 @@ public:
         }
         outputStream.flush();
 
-        if((&outputStream == &std::cout || &outputStream == &std::clog) && myrank != 0){
+        if((&outputStream == &std::cout || &outputStream == &std::clog) && myRank != 0){
             // Print in reverse order
             char tmp;
-            retMpi = MPI_Send(&tmp, 1, MPI_BYTE, myrank-1, 99, inComm);
+            retMpi = MPI_Send(&tmp, 1, MPI_BYTE, myRank-1, 99, inComm);
             assert(retMpi == MPI_SUCCESS);
         }
     }
@@ -283,7 +283,7 @@ public:
             }
         }
 
-        if(myrank != 0){
+        if(myRank != 0){
             const std::string strOutput = myResults.str();
             int sizeOutput = strOutput.length();
             retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm);
@@ -444,6 +444,153 @@ public:
         outputStream.flush();
     }
 
+    void showHtml(const MPI_Comm inComm) const { // Render this rank's timing tree as HTML; rank 0 gathers every rank's fragment and writes one file.
+        int myRank, nbProcess;
+        int retMpi = MPI_Comm_rank( inComm, &myRank);
+        assert(retMpi == MPI_SUCCESS);
+        retMpi = MPI_Comm_size( inComm, &nbProcess);
+        assert(retMpi == MPI_SUCCESS);
+
+        std::stringstream myResults; // HTML fragment for this rank only
+
+        std::stack<std::pair<int, const std::shared_ptr<CoreEvent>>> events; // (depth, event) work list; depth -1 marks "close the enclosing list"
+
+        for (int idx = static_cast<int>(root->getChildren().size()) - 1; idx >= 0; --idx) { // pushed in reverse so the first child ends up on top of the stack
+            events.push({0, root->getChildren()[idx]});
+        }
+
+        myResults << "<h1>Process : " << myRank << "</h1>\n";
+
+        double totalDuration = 0; // sum of the durations of root's direct children; denominator of every percentage below
+        for (int idx =
+             static_cast<int>(root->getChildren().size()) - 1;
+             idx >= 0; --idx) {
+            totalDuration += root->getChildren()[idx]->getDuration();
+        }
+
+        myResults << "<h2> " << root->getName() << " (" << totalDuration << "s)</h2>\n";
+        myResults << "<ul>\n";
+        int idxBox = myRank*100000; // first checkbox id for this rank; presumably offset per rank so ids stay distinct in the merged file
+
+        while (events.size()) {
+            const std::pair<int, const std::shared_ptr<CoreEvent>> eventToShow =
+                    events.top();
+            events.pop();
+
+            if(eventToShow.first == -1){ // sentinel pushed by the branch below: every child of a parent has been emitted
+                myResults << "</ul>\n";
+                myResults << "</li>\n";
+            }
+            else if(eventToShow.second->getChildren().size() == 0){ // leaf event: a plain list item
+                myResults << "<li>&#9679; <span title=\"";
+                if (eventToShow.second->getOccurrence() != 1) { // statistics go into the tooltip only when the occurrence count is not 1
+                    myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                                 << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                                 << eventToShow.second->getOccurrence();
+                }
+                myResults << "\">" << eventToShow.second->getName(); // NOTE(review): the name is streamed as-is, with no HTML escaping
+                myResults << " (" << 100*eventToShow.second->getDuration()/totalDuration << "% -- " ; // NOTE(review): divides by zero if totalDuration is 0
+                myResults << eventToShow.second->getDuration() <<"s)</span></li>\n";
+            }
+            else{ // event with children: checkbox + label; the stylesheet written by rank 0 shows the nested <ul> only when checked
+                myResults << "<li><input type=\"checkbox\" id=\"c" << idxBox << "\" />\n";
+                myResults << "  <i class=\"fa fa-angle-double-right\">&rarr; </i>\n";
+                myResults << "  <i class=\"fa fa-angle-double-down\">&darr; </i>\n";
+                myResults << "  <label for=\"c" << idxBox++ << "\"><span title=\""; // post-increment: the label targets the id written just above, then the counter advances
+                if (eventToShow.second->getOccurrence() != 1) {
+                    myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                                 << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                                 << eventToShow.second->getOccurrence();
+                }
+                myResults << "\">" << eventToShow.second->getName();
+                myResults << " (" << 100*eventToShow.second->getDuration()/totalDuration << "% -- " ;
+                myResults << eventToShow.second->getDuration() <<"s)</span></label>\n";
+                myResults << "<ul>\n";
+                events.push({-1, std::shared_ptr<CoreEvent>()}); // closing sentinel goes in first so it is popped after all the children
+
+                for (int idx =
+                     static_cast<int>(eventToShow.second->getChildren().size()) - 1;
+                     idx >= 0; --idx) {
+                    events.push(
+                    {eventToShow.first + 1, eventToShow.second->getChildren()[idx]}); // the depth value is stored but only compared against -1 in this method
+                }
+            }
+        }
+
+        myResults << "</ul>\n";
+
+        if(myRank != 0){ // every rank except 0 ships its fragment to rank 0: length with tag 99, then the bytes with tag 100
+            const std::string strOutput = myResults.str();
+            int sizeOutput = strOutput.length(); // NOTE(review): length() is narrowed to int here
+            retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm);
+            assert(retMpi == MPI_SUCCESS);
+            retMpi = MPI_Send((void*)strOutput.data(), sizeOutput, MPI_CHAR, 0, 100, inComm);
+            assert(retMpi == MPI_SUCCESS);
+        }
+        else{ // rank 0 writes the file
+            const std::string htmlOutput = (getenv("HTMLOUTPUT")?getenv("HTMLOUTPUT"):"timings.html"); // output path: $HTMLOUTPUT when set, otherwise "timings.html"
+
+            std::cout << "Timing output html set to : " << htmlOutput << std::endl;
+
+            std::ofstream htmlfile(htmlOutput); // NOTE(review): no check here that the file opened successfully
+
+            htmlfile << "<html>\
+                        <head>\
+                        <style>\
+                        input {\
+                          display: none;\
+                        }\
+                        input ~ ul {\
+                         display: none;\
+                        }\
+                        input:checked ~ ul {\
+                         display: block;\
+                        }\
+                        input ~ .fa-angle-double-down {\
+                          display: none;\
+                        }\
+                        input:checked ~ .fa-angle-double-right {\
+                          display: none;\
+                        }\
+                        input:checked ~ .fa-angle-double-down {\
+                          display: inline;\
+                        }\
+                        li {\
+                          display: block;\
+                          font-family: 'Arial';\
+                          font-size: 15px;\
+                          padding: 0.2em;\
+                          border: 1px solid transparent;\
+                        }\
+                        li:hover {\
+                          border: 1px solid grey;\
+                          border-radius: 3px;\
+                          background-color: lightgrey;\
+                        }\
+                        span:hover {\
+                            color: blue;\
+                        }\
+                        </style>\
+                        </head>\
+                        <body>";
+
+            std::vector<char> buffer; // receive buffer, reused for each rank
+            for(int idxProc = nbProcess-1 ; idxProc > 0 ; --idxProc){ // highest rank first, down to rank 1; rank 0's own fragment is appended after the loop
+                int sizeRecv;
+                retMpi = MPI_Recv(&sizeRecv, 1, MPI_INT, idxProc, 99, inComm, MPI_STATUS_IGNORE);
+                assert(retMpi == MPI_SUCCESS);
+                buffer.resize(sizeRecv+1); // one extra byte for the terminating '\0' set below
+                retMpi = MPI_Recv(buffer.data(), sizeRecv, MPI_CHAR, idxProc, 100, inComm, MPI_STATUS_IGNORE);
+                assert(retMpi == MPI_SUCCESS);
+                buffer[sizeRecv]='\0';
+                htmlfile << buffer.data(); // streamed as a C string, hence the terminator
+            }
+            htmlfile << myResults.str();
+            htmlfile << "</body>\
+                        </html>";
+        }
+    }
+
     friend scope_timer;
 };