From adeb3b06f6dd19750f65322643412b44ee291420 Mon Sep 17 00:00:00 2001
From: Thomas Purcell <purcell@fhi-berlin.mpg.de>
Date: Wed, 5 Aug 2020 11:28:58 +0200
Subject: [PATCH] Cross-correlation now working

can set a maximum cross correlation for selected features
---
 .../feature_space/FeatureSpace.cpp            | 39 ++++++++++++-------
 .../feature_space/FeatureSpace.hpp            | 17 ++++++++
 src/inputs/InputParser.cpp                    |  3 +-
 src/inputs/InputParser.hpp                    |  1 +
 src/python/bindings_docstring_keyed.cpp       |  4 +-
 src/python/feature_creation/FeatureSpace.cpp  |  4 ++
 6 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp
index 7796e1a8..17b74f23 100644
--- a/src/feature_creation/feature_space/FeatureSpace.cpp
+++ b/src/feature_creation/feature_space/FeatureSpace.cpp
@@ -29,6 +29,7 @@ FeatureSpace::FeatureSpace(
     int n_sis_select,
     int max_store_rung,
     int n_rung_generate,
+    double cross_corr_max,
     double min_abs_feat_val,
     double max_abs_feat_val
 ):
@@ -41,6 +42,7 @@ FeatureSpace::FeatureSpace(
     _feature_space_file("feature_space/selected_features.txt"),
     _feature_space_summary_file("feature_space/SIS_summary.txt"),
     _mpi_comm(mpi_comm),
+    _cross_cor_max(cross_corr_max),
     _l_bound(min_abs_feat_val),
     _u_bound(max_abs_feat_val),
     _max_phi(max_phi),
@@ -434,13 +436,13 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
         {
             double cur_score = scores[inds[ii]];
 
-            bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp);
+            // bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp);
 
-            // Check the feature against those selected from previous SIS iterations
-            if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid)
-                is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp);
+            // // Check the feature against those selected from previous SIS iterations
+            // if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid)
+            //     is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp);
 
-            if(is_valid)
+            if(valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), node_value_arrs::N_SELECTED - _n_sis_select + scores_sel.size()))
             {
                 if(scores_sel.size() == _n_sis_select)
                 {
@@ -472,6 +474,15 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
     }
 }
 
+bool FeatureSpace::valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel)
+{
+    double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp);
+    for(int dd = start_sel; dd < end_sel; ++dd)
+        if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10)
+            return false;
+    return true;
+}
+
 bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp)
 {
     std::transform(scores_past.begin(), scores_past.end(), scores_comp.begin(), [&cur_score](double score){return std::abs(cur_score - score);});
@@ -480,7 +491,7 @@ bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, s
     if(*std::min_element(scores_comp.begin(), scores_comp.end()) < 1e-10)
     {
         int dd = std::min_element(scores_comp.begin(), scores_comp.end()) - scores_comp.begin();
-        if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1e-10)
+        if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10)
             return false;
     }
     return true;
@@ -493,7 +504,7 @@ bool FeatureSpace::valid_score_against_current(int end_check, double* val_ptr, d
     if(*std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) < 1e-10)
     {
         int dd = std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) - scores_comp.begin();
-        if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + dd), val_ptr, _n_samp))) < 1e-10)
+        if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10)
             return false;
     }
     return true;
@@ -530,7 +541,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
     int cur_feat_local = 0;
     double cur_score = 0.0;
 
-    while(_scores[inds[ii]] < -1.0)
+    while(_scores[inds[ii]] < -1.0000000001)
         ++ii;
 
     std::vector<double> scores_prev_sel;
@@ -542,12 +553,12 @@ void FeatureSpace::sis(std::vector<double>& prop)
 
     while((cur_feat_local != _n_sis_select) && (ii < _scores.size()))
     {
-        bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp);
-        // Check the feature against those selected from previous SIS iterations
-        if(cur_feat > 0 && is_valid)
-            is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp);
+        // bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp);
+        // // Check the feature against those selected from previous SIS iterations
+        // if(cur_feat > 0 && is_valid)
+        //     is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp);
 
-        if(is_valid)
+        if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), cur_feat + cur_feat_local))
         {
             scores_sel[cur_feat_local] = _scores[inds[ii]];
             phi_sel.push_back(_phi[inds[ii]]);
@@ -678,7 +689,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
                 // Get the n_sis_select best features (compare against features sent from other processes)
                 while((cur_feat != node_value_arrs::N_SELECTED) && (ii < sent_scores.size()))
                 {
-                    if(valid_score_against_current(cur_feat_local, sent_phi[inds[ii]]->value().data(), sent_scores[inds[ii]], scores_sel, scores_comp))
+                    if(valid_feature_against_selected(sent_phi[inds[ii]]->value().data(), cur_feat + cur_feat_local, cur_feat))
                     {
                         out_file_stream << std::setw(14) <<std::left << cur_feat << sent_phi[inds[ii]]->postfix_expr() << std::endl;
                         sum_file_stream << std::setw(14) <<std::left << cur_feat << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl;
diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp
index 6e3e383f..45cddcf6 100644
--- a/src/feature_creation/feature_space/FeatureSpace.hpp
+++ b/src/feature_creation/feature_space/FeatureSpace.hpp
@@ -55,6 +55,7 @@ class FeatureSpace
     std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
     std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
 
+    double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
     double _l_bound; //!< lower bound for absolute value of the features
     double _u_bound; //!< upper bound for absolute value of the features
 
@@ -78,6 +79,7 @@ public:
      * @param n_sis_select number of features to select during each SIS step
      * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
      * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
+     * @param cross_corr_max Maximum cross-correlation used for selecting features
      * @param min_abs_feat_val minimum absolute feature value
      * @param max_abs_feat_val maximum absolute feature value
      */
@@ -91,6 +93,7 @@ public:
         int n_sis_select=1,
         int max_store_rung=-1,
         int n_rung_generate=0,
+        double cross_corr_max=1.0,
         double min_abs_feat_val=1e-50,
         double max_abs_feat_val=1e50
     );
@@ -216,6 +219,16 @@ public:
      */
     void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
 
+    /**
+     * @brief Checks the feature to see if it is still valid against previously selected features
+     *
+     * @param val_ptr pointer to value array of the current feature
+     * @param end_sel index of the feature to stop checking
+     *
+     * @return True if the feature is still valid
+     */
+    bool valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel = 0);
+
     /**
      * @brief Check if a feature overlaps with a feature previously selected in earlier SIS iterations
      * @details Compares the projection score of the current candidate feature with all those of previously selected features (using the current prop) and
@@ -273,6 +286,7 @@ public:
          * @param n_sis_select number of features to select during each SIS step
          * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
          * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
+         * @param cross_corr_max Maximum cross-correlation used for selecting features
          * @param min_abs_feat_val minimum absolute feature value
          * @param max_abs_feat_val maximum absolute feature value
          */
@@ -285,6 +299,7 @@ public:
             int n_sis_select=1,
             int max_store_rung=-1,
             int n_rung_generate=0,
+            double cross_corr_max=1.0,
             double min_abs_feat_val=1e-50,
             double max_abs_feat_val=1e50
         );
@@ -301,6 +316,7 @@ public:
          * @param n_sis_select number of features to select during each SIS step
          * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
          * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
+         * @param cross_corr_max Maximum cross-correlation used for selecting features
          * @param min_abs_feat_val minimum absolute feature value
          * @param max_abs_feat_val maximum absolute feature value
          */
@@ -313,6 +329,7 @@ public:
             int n_sis_select=1,
             int max_store_rung=-1,
             int n_rung_generate=0,
+            double cross_corr_max=1.0,
             double min_abs_feat_val=1e-50,
             double max_abs_feat_val=1e50
         );
diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp
index 65d8cfd5..f10e5ee1 100644
--- a/src/inputs/InputParser.cpp
+++ b/src/inputs/InputParser.cpp
@@ -7,6 +7,7 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh
     _prop_key(IP.get<std::string>("property_key", "prop")),
     _task_key(IP.get<std::string>("task_key", "Task")),
     _leave_out_inds(as_vector<int>(IP, "leave_out_inds")),
+    _cross_cor_max(IP.get<double>("max_feat_cross_correlation", 1.0)),
     _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)),
     _u_bound(IP.get<double>("max_abs_feat_val", 1e50)),
     _n_dim(IP.get<int>("desc_dim")),
@@ -270,7 +271,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
     for(int ff = 0; ff < headers.size(); ++ff)
         phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff]));
 
-    _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound);
+    _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
 }
 
 void stripComments(std::string& filename)
diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp
index 08ba841e..f55e4c18 100644
--- a/src/inputs/InputParser.hpp
+++ b/src/inputs/InputParser.hpp
@@ -49,6 +49,7 @@ public:
 
     std::shared_ptr<FeatureSpace> _feat_space; //!< shared_ptr to the FeatureSpace generated from the data file and the input file
 
+    double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
     double _l_bound; //!< Minimum absolute value allowed for the feature.
     double _u_bound; //!< Maximum absolute value allowed for the feature.
 
diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp
index 17dfd0c9..38ac74e7 100644
--- a/src/python/bindings_docstring_keyed.cpp
+++ b/src/python/bindings_docstring_keyed.cpp
@@ -39,8 +39,8 @@ void sisso::feature_creation::registerFeatureSpace()
     void (FeatureSpace::*sis_list)(list) = &FeatureSpace::sis;
     void (FeatureSpace::*sis_ndarray)(np::ndarray) = &FeatureSpace::sis;
 
-    class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double>>())
-        .def(init<list, list, list, list, optional<int, int, int, int, double>>())
+    class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>())
+        .def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>())
         .def("sis", sis_list, "@DocString_feat_space_sis_list@")
         .def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@")
         .def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@")
diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp
index 16d72b1f..66502c12 100644
--- a/src/python/feature_creation/FeatureSpace.cpp
+++ b/src/python/feature_creation/FeatureSpace.cpp
@@ -9,6 +9,7 @@ FeatureSpace::FeatureSpace(
     int n_sis_select,
     int max_store_rung,
     int n_rung_generate,
+    double cross_corr_max,
     double min_abs_feat_val,
     double max_abs_feat_val
 ):
@@ -21,6 +22,7 @@ FeatureSpace::FeatureSpace(
     _feature_space_file("feature_space/selected_features.txt"),
     _feature_space_summary_file("feature_space/SIS_summary.txt"),
     _mpi_comm(mpi_setup::comm),
+    _cross_cor_max(cross_corr_max),
     _l_bound(min_abs_feat_val),
     _u_bound(max_abs_feat_val),
     _max_phi(max_phi),
@@ -42,6 +44,7 @@ FeatureSpace::FeatureSpace(
     int n_sis_select,
     int max_store_rung,
     int n_rung_generate,
+    double cross_corr_max,
     double min_abs_feat_val,
     double max_abs_feat_val
 ):
@@ -54,6 +57,7 @@ FeatureSpace::FeatureSpace(
     _feature_space_file("feature_space/selected_features.txt"),
     _feature_space_summary_file("feature_space/SIS_summary.txt"),
     _mpi_comm(mpi_setup::comm),
+    _cross_cor_max(cross_corr_max),
     _l_bound(min_abs_feat_val),
     _u_bound(max_abs_feat_val),
     _max_phi(max_phi),
-- 
GitLab